reading processed data by CarDEC
# Load the CarDEC result and build a Seurat object (obj1) from its denoised counts.
adata <- ad$read_h5ad("../CarDEC Results/adata_CarDEC.h5ad")
cell.meta.data <- py_to_r(adata$obs)
cell.meta.data$dataset_batch <- plyr::mapvalues(
  cell.meta.data$batch_label,
  names(maprules),
  maprules
)
gene_ann0 <- py_to_r(adata$var)

# After transposing, rows are named by adata$var and columns by cell name.
mtx <- t(py_to_r(adata$layers['denoised counts']))
colnames(mtx) <- cell.meta.data$cellname
rownames(mtx) <- rownames(gene_ann0)
mtx_sizefactor <- 1e4 / colSums(mtx)

obj1 <- CreateSeuratObject(mtx, meta.data = cell.meta.data)
obj1 <- NormalizeData(obj1, verbose = FALSE)
Idents(obj1) <- "BatchID"

# Per-gene annotation: CarDEC variance type plus FastExpMean() of the counts.
avg_exp <- Seurat:::FastExpMean(obj1@assays$RNA@counts, display_progress = FALSE)
gene_ann_all <- data.frame(
  gene_short_name = make.unique(rownames(gene_ann0)),
  VarianceType = gene_ann0$`Variance Type`,
  avg_exp = avg_exp,
  row.names = make.unique(rownames(gene_ann0))
)
# Copy the pseudotime stored in cds_cardec.rds (monocle output computed from
# the CarDEC result) into both Seurat objects.
cds <- readRDS("./cds_cardec.rds")
xtmp <- as.data.frame(colData(cds)[, c("Pseudotime")])
obj0$Pseudotime <- colData(cds)$Pseudotime
obj1$Pseudotime <- colData(cds)$Pseudotime

# vst.mean of every gene in obj0, with the gene name as an explicit column.
obj0 <- NormalizeData(obj0, verbose = FALSE)
obj0 <- FindVariableFeatures(obj0, verbose = FALSE)
xx <- obj0@assays$RNA@meta.features[, c("vst.mean"), drop = FALSE]
xx$gene <- rownames(xx)
We obtained the TFs related to monocytes from Figure 4 of "Gene expression profiling reveals the defining features of the classical, intermediate, and nonclassical human monocyte subsets".
# TF table: keep the TFs present in obj0, then attach the CarDEC gene
# annotation and the vst.mean computed on obj0.
tf_df <- openxlsx::read.xlsx("./TF_blood.xlsx") %>%
  filter(Gene %in% rownames(obj0)) %>%
  left_join(gene_ann_all, by = c("Gene" = "gene_short_name")) %>%
  left_join(xx, by = c("Gene" = "gene")) %>%
  as.data.frame()
Column `Gene`/`gene_short_name` joining character vector and factor, coercing into character vector
# Index the TF table by gene name and display it.
rownames(tf_df) <- tf_df$Gene
tf_df
# Number of cells in obj0 with a non-zero count for each TF.
num_cells_expressed <- Matrix::rowSums(obj0@assays$RNA@counts[tf_df$Gene, ] != 0)
tf_df$num_cells_expressed <- num_cells_expressed[rownames(tf_df)]
Heatmap-no smoothed
Note: genes labelled in blue are LVGs and genes labelled in red are HVGs. We first removed genes with an average expression lower than 0.1, then sorted the remaining genes by decreasing average expression within each gene class, and highlighted the top 20 genes of each class. (Three gene clusters were also computed, but they are not shown here.)
pseudotime from cardec’s denoised gene expression
# Heatmap of the TF genes using the CarDEC-denoised counts (obj1);
# cells are ordered by obj1$Pseudotime.
obj1=NormalizeData(obj1,verbose = F)
obj1=FindVariableFeatures(obj1,verbose = F)
hvf.info=obj1@assays$RNA@meta.features
# gene22: per-gene vst statistics sorted by decreasing standardized variance
gene22=hvf.info[order(hvf.info$vst.variance.standardized, decreasing = TRUE), ,drop = FALSE]
# number of cells with a non-zero count for each TF
num_cells_expressed=Matrix::rowSums(obj1@assays$RNA@counts[tf_df$Gene,]!=0)
obj1=ScaleData(obj1,features = tf_df$Gene,verbose = F)
#scaled
#m = t(scale(m1,center = T))
# genes x cells matrix of scaled expression; NaN is set to 0 and values are clipped to [-3, 3]
m=t(FetchData(obj1,vars = tf_df$Gene,slot = "scale.data"))
m[is.nan(m)] = 0
m[m >= 3] = 3
m[m <= -3] = -3
heatmap_matrix <- m
# Gene-gene distance is (1 - correlation)/2. pheatmap is run silently, without
# an output file, and only its row dendrogram (ward.D2) is used below.
row_dist <- as.dist((1 - cor(t(heatmap_matrix)))/2)
res2 <- list(ph=pheatmap::pheatmap(heatmap_matrix, useRaster = T, cluster_cols = FALSE,
cluster_rows = T, show_rownames = F, show_colnames = F,
clustering_distance_rows = row_dist, clustering_method = "ward.D2",
cutree_rows = 3, silent = TRUE, filename = NA),m=heatmap_matrix)
# z: matrix with rows in dendrogram order; Cluster_labels: 3-group cut in the same order
z=res2[[2]][res2[[1]]$tree_row$order,]
Cluster_labels=cutree(res2[[1]]$tree_row,3)[res2[[1]]$tree_row$order]
# Per-gene annotation table, one row per row of z
row_anno=data.frame(genename=rownames(z),
Cluster=Cluster_labels,
Class=tf_df[rownames(z),"Gene"],
VarianceType=tf_df[rownames(z),"VarianceType"],
GeneType=tf_df[rownames(z),"GeneType"],
#avg_exp=tf_df[rownames(z),"avg_exp"],
vst.mean=gene22[rownames(z),"vst.mean"],
vst.variance.standard=gene22[rownames(z),"vst.variance.standardized"],
num_cells_expressed=num_cells_expressed[rownames(z)],
row.names = rownames(z),stringsAsFactors = F)
# Rename the cluster ids to "Module<k>"
row_anno$GeneClass=plyr::mapvalues(row_anno$Cluster,from=sort(unique(row_anno$Cluster)),
to=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels))))))
#order_id0=order(row_anno$Cluster,row_anno$GeneType,row_anno$VarianceType)
#z=z[order_id0,]
#row_anno=row_anno[order_id0,]
#GeneClass.color=gg_color_hue(length(unique(Cluster_labels)))
GeneClass.color=RColorBrewer::brewer.pal(3,"Dark2")
names(GeneClass.color)=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels)))))
# gene Module color
row_anno$color=plyr::mapvalues(row_anno$GeneClass,from=names(GeneClass.color),
to=GeneClass.color)
# label colour per gene: red for HVG, blue for LVG
row_anno$HVG_color=plyr::mapvalues(as.character(row_anno$VarianceType),from=c("HVG","LVG"),
to=c("#E41A1C","#377EB8"))
rownames(row_anno)=row_anno$genename
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# column annotation
#col_anno=data.frame(pseudotime=seq(0,1,length=ncol(z)),
# row.names = as.character(1:ncol(z)))
# Per-cell annotation: batch relabelled to T1/T2/T3 and pseudotime
col_anno=data.frame(BatchID=plyr::mapvalues(obj1$BatchID,c("MH001","RP002","RP009"),c("T1","T2","T3")),
Pseudotime=obj1$Pseudotime,
row.names = colnames(obj1))
# Sort the columns of z and the rows of col_anno by increasing pseudotime
col_id=order(col_anno$Pseudotime)
z=z[,col_id]
col_anno=col_anno[col_id,]
#
# NOTE(review): nothing in this chunk visibly draws random numbers; the seed is presumably kept for reproducibility - confirm
set.seed(10)
# Rank the genes with vst.mean > 0.1 by decreasing vst.mean within each VarianceType
row_anno.tmp=row_anno%>%
filter(vst.mean>0.1)%>%
group_by(VarianceType)%>%
arrange(desc(vst.mean),.by_group = TRUE)%>%
mutate(n_order=1:n())%>%as.data.frame()
#id.select=order(row_anno$qval,decreasing = F)[1:30]
# Row indices of the genes ranked 1-20 in their VarianceType group; these are the genes labelled on the heatmap
id.select=which(row_anno$genename%in%row_anno.tmp$genename[row_anno.tmp$n_order%in%c(1:20)])
row_anno.new=row_anno[id.select,]
col_fun = circlize::colorRamp2(seq(-3, 3,length=200), colorRampPalette(c("blue", "white", "red"))(200))
#pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(c("darkblue","black","#FFFF00"))(150))
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# Top annotation: pseudotime colour bar and batch
top_anno=HeatmapAnnotation( Pseudotime = anno_simple(col_anno$Pseudotime, col = pseudotime_col_fun),
BatchID=col_anno$BatchID,
col=list(BatchID=c("T1"="#66C2A5","T2"="#FC8D62","T3"="#8DA0CB")),
show_legend = c(T,T),
annotation_name_side = "right")
# Left annotation: only VarianceType is active (GeneClass/GeneType/Nonzero are commented out)
left_anno=rowAnnotation(#GeneClass=row_anno$GeneClass,
#GeneType=row_anno$GeneType,
VarianceType=row_anno$VarianceType,
#Nonzero=anno_barplot(log10(row_anno$num_cells_expressed),
# gp = gpar(fill ="#00C4FF",col=NA),
# bar_width = 1,
# height = unit(2.5, "cm")),
annotation_name_rot=90,
annotation_name_side = "top",
#col=list(GeneClass=celltype_color[names(celltype_color)%in%unique(tmp.anno$celltype)]),
col=list(GeneClass=GeneClass.color,
VarianceType=c("HVG"="#E41A1C","LVG"="#377EB8")),
show_annotation_name=c(F,T),
show_legend=c(T,T))
# Main heatmap: no clustering inside ComplexHeatmap, row and column labels are blanked,
# and the selected genes are marked on the right in their HVG/LVG colour
p1=ComplexHeatmap::Heatmap(z, name = "scaled.expression",
cluster_rows = F,
cluster_columns = F,
col=col_fun,
column_labels = rep("",length=ncol(z)),
row_labels = rep("",length=nrow(z)),
#row_labels = gene.use.df$gene,
row_names_side = "right",
#the following font size has no effect in this situation
row_names_gp = gpar(fontsize=5),
column_names_gp = gpar(fontsize = 8),
#column_gap = unit(0.5, "mm"),
#column_split = tmp.anno$res,
#column_title = "%s",
#column_title_rot=90,
#row_gap = unit(0.5, "mm"),
#row_split = row_anno$GeneClass,
#row_title = "%s",
row_title_gp = gpar(fontsize = 12),
row_title_rot = 90,
top_annotation = top_anno,
left_annotation= left_anno,
heatmap_legend_param=list(legend_direction="horizontal",legend_width = unit(2.5, "cm")),
right_annotation = rowAnnotation(foo=anno_mark(at=id.select,
side="right",
extend=unit(0.2,"cm"),
labels_gp = gpar(fontsize=8,col=row_anno$HVG_color[id.select]),
labels=row_anno.new$genename),
annotation_name_side="top"))
#draw(p1,padding=unit(c(2,2,2,2),"mm"))
# The pseudotime legend is built by hand and handed to draw() through annotation_legend_list
lgd_pse=Legend(title = "Pseudotime", col = pseudotime_col_fun, at = c(0,0.5,1),
labels = c("low","med","high"),legend_height = unit(2.5, "cm"),border = NA,title_position = "topcenter",direction="horizontal")
draw(p1,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")

pseudotime from raw gene expression
# Load the monocle result computed from the raw counts.
cds=readRDS("./cds_raw.rds")
# The pseudotime of cds is copied into obj0 by position right after this, so
# the two objects must list the same cells in the same order. The number of
# matching cell names is printed, and any mismatch now stops the run instead
# of silently attaching pseudotime values to the wrong cells.
print(sum(colnames(obj0)==colnames(cds)))
stopifnot(ncol(obj0)==ncol(cds),all(colnames(obj0)==colnames(cds)))
[1] 10878
#10878
# Heatmap of the TF genes using the raw counts (obj0); cells are ordered by the
# pseudotime taken from cds (stored as obj0$Pseudotime_raw).
obj0$Pseudotime_raw=colData(cds)$Pseudotime
obj0=NormalizeData(obj0,verbose = F)
obj0=FindVariableFeatures(obj0,verbose = F)
hvf.info=obj0@assays$RNA@meta.features
# gene22: per-gene vst statistics sorted by decreasing standardized variance
gene22=hvf.info[order(hvf.info$vst.variance.standardized, decreasing = TRUE), ,drop = FALSE]
# number of cells with a non-zero count for each TF
num_cells_expressed=Matrix::rowSums(obj0@assays$RNA@counts[tf_df$Gene,]!=0)
obj0=ScaleData(obj0,features = tf_df$Gene,verbose = F)
#scaled
#m = t(scale(m1,center = T))
# genes x cells matrix of scaled expression; NaN is set to 0 and values are clipped to [-3, 3]
m=t(FetchData(obj0,vars = tf_df$Gene,slot = "scale.data"))
m[is.nan(m)] = 0
m[m >= 3] = 3
m[m <= -3] = -3
heatmap_matrix <- m
# Gene-gene distance is (1 - correlation)/2. pheatmap is run silently, without
# an output file, and only its row dendrogram (ward.D2) is used below.
row_dist <- as.dist((1 - cor(t(heatmap_matrix)))/2)
res2 <- list(ph=pheatmap::pheatmap(heatmap_matrix, useRaster = T, cluster_cols = FALSE,
cluster_rows = T, show_rownames = F, show_colnames = F,
clustering_distance_rows = row_dist, clustering_method = "ward.D2",
cutree_rows = 3, silent = TRUE, filename = NA),m=heatmap_matrix)
# z: matrix with rows in dendrogram order; Cluster_labels: 3-group cut in the same order
z=res2[[2]][res2[[1]]$tree_row$order,]
Cluster_labels=cutree(res2[[1]]$tree_row,3)[res2[[1]]$tree_row$order]
# Per-gene annotation table, one row per row of z
row_anno=data.frame(genename=rownames(z),
Cluster=Cluster_labels,
Class=tf_df[rownames(z),"Gene"],
VarianceType=tf_df[rownames(z),"VarianceType"],
GeneType=tf_df[rownames(z),"GeneType"],
vst.mean=gene22[rownames(z),"vst.mean"],
vst.variance.standard=gene22[rownames(z),"vst.variance.standardized"],
num_cells_expressed=num_cells_expressed[rownames(z)],
row.names = rownames(z),stringsAsFactors = F)
# Rename the cluster ids to "Module<k>"
row_anno$GeneClass=plyr::mapvalues(row_anno$Cluster,from=sort(unique(row_anno$Cluster)),
to=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels))))))
#order_id0=order(row_anno$Cluster,row_anno$GeneType,row_anno$VarianceType)
#z=z[order_id0,]
#row_anno=row_anno[order_id0,]
#GeneClass.color=gg_color_hue(length(unique(Cluster_labels)))
GeneClass.color=RColorBrewer::brewer.pal(3,"Dark2")
names(GeneClass.color)=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels)))))
# gene Module color
row_anno$color=plyr::mapvalues(row_anno$GeneClass,from=names(GeneClass.color),
to=GeneClass.color)
# label colour per gene: red for HVG, blue for LVG
row_anno$HVG_color=plyr::mapvalues(as.character(row_anno$VarianceType),from=c("HVG","LVG"),
to=c("#E41A1C","#377EB8"))
rownames(row_anno)=row_anno$genename
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# column annotation
#col_anno=data.frame(pseudotime=seq(0,1,length=ncol(z)),
# row.names = as.character(1:ncol(z)))
# Per-cell annotation: batch relabelled to T1/T2/T3 and the raw-data pseudotime
col_anno=data.frame(BatchID=plyr::mapvalues(obj0$BatchID,c("MH001","RP002","RP009"),c("T1","T2","T3")),
Pseudotime=obj0$Pseudotime_raw,
#Pseudotime=obj0$Pseudotime,
row.names = colnames(obj0))
# Sort the columns of z and the rows of col_anno by increasing pseudotime
col_id=order(col_anno$Pseudotime)
z=z[,col_id]
col_anno=col_anno[col_id,]
#
# NOTE(review): nothing in this chunk visibly draws random numbers; the seed is presumably kept for reproducibility - confirm
set.seed(10)
# Rank the genes with vst.mean > 0.1 by decreasing vst.mean within each VarianceType
row_anno.tmp=row_anno%>%
filter(vst.mean>0.1)%>%
group_by(VarianceType)%>%
arrange(desc(vst.mean),.by_group = TRUE)%>%
mutate(n_order=1:n())%>%as.data.frame()
#id.select=order(row_anno$qval,decreasing = F)[1:30]
# Row indices of the genes ranked 1-20 in their VarianceType group; these are the genes labelled on the heatmap
id.select=which(row_anno$genename%in%row_anno.tmp$genename[row_anno.tmp$n_order%in%c(1:20)])
row_anno.new=row_anno[id.select,]
col_fun = circlize::colorRamp2(seq(-3, 3,length=200), colorRampPalette(c("blue", "white", "red"))(200))
#pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(c("darkblue","black","#FFFF00"))(150))
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# Top annotation: pseudotime colour bar and batch
top_anno=HeatmapAnnotation( Pseudotime = anno_simple(col_anno$Pseudotime, col = pseudotime_col_fun),
BatchID=col_anno$BatchID,
col=list(BatchID=c("T1"="#66C2A5","T2"="#FC8D62","T3"="#8DA0CB")),
show_legend = c(T,T),
annotation_name_side = "right")
# Left annotation: only VarianceType is active (GeneClass/GeneType/Nonzero are commented out)
left_anno=rowAnnotation(#GeneClass=row_anno$GeneClass,
#GeneType=row_anno$GeneType,
VarianceType=row_anno$VarianceType,
#Nonzero=anno_barplot(log10(row_anno$num_cells_expressed),
# gp = gpar(fill ="#00C4FF",col=NA),
# bar_width = 1,
# height = unit(2.5, "cm")),
annotation_name_rot=90,
annotation_name_side = "top",
#col=list(GeneClass=celltype_color[names(celltype_color)%in%unique(tmp.anno$celltype)]),
col=list(GeneClass=GeneClass.color,
VarianceType=c("HVG"="#E41A1C","LVG"="#377EB8")),
show_annotation_name=c(F,T),
show_legend=c(T,T))
# Main heatmap: no clustering inside ComplexHeatmap, row and column labels are blanked,
# and the selected genes are marked on the right in their HVG/LVG colour
p2=ComplexHeatmap::Heatmap(z, name = "scaled.expression",
cluster_rows = F,
cluster_columns = F,
col=col_fun,
column_labels = rep("",length=ncol(z)),
row_labels = rep("",length=nrow(z)),
#row_labels = gene.use.df$gene,
row_names_side = "right",
#the following font size has no effect in this situation
row_names_gp = gpar(fontsize=5),
column_names_gp = gpar(fontsize = 8),
#column_gap = unit(0.5, "mm"),
#column_split = tmp.anno$res,
#column_title = "%s",
#column_title_rot=90,
#row_split =gene.use.df$cluster,
row_gap = unit(0.5, "mm"),
#row_split = row_anno$GeneClass,
#row_title = "%s",
row_title_gp = gpar(fontsize = 12),
row_title_rot = 90,
top_annotation = top_anno,
left_annotation= left_anno,
heatmap_legend_param=list(legend_direction="horizontal",legend_width = unit(2.5, "cm")),
right_annotation = rowAnnotation(foo=anno_mark(at=id.select,
side="right",
extend=unit(0.2,"cm"),
labels_gp = gpar(fontsize=8,col=row_anno$HVG_color[id.select]),
labels=row_anno.new$genename),
annotation_name_side="top"))
#draw(p1,padding=unit(c(2,2,2,2),"mm"))
# The pseudotime legend is built by hand and handed to draw() through annotation_legend_list
lgd_pse=Legend(title = "Pseudotime", col = pseudotime_col_fun, at = c(0,0.5,1),
labels = c("low","med","high"),legend_height = unit(2.5, "cm"),border = NA,title_position = "topcenter",direction="horizontal")
draw(p2,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")

reading results from other methods
# Python anndata module; results stay Python objects until py_to_r() is called.
ad <- import("anndata", convert = FALSE)

# Raw data as a Seurat object, plus the vst.mean of every gene.
adata <- ad$read_h5ad("../../dca_test.h5ad")
obj_raw <- Convert_to_seurat3(adata)
obj_raw <- NormalizeData(obj_raw, verbose = FALSE)
obj_raw <- FindVariableFeatures(obj_raw, verbose = FALSE)
xx_vst.mean <- obj_raw@assays$RNA@meta.features[, c("vst.mean"), drop = FALSE]
xx_vst.mean$gene <- rownames(xx_vst.mean)
raw.data <- obj_raw@assays$RNA@counts

# Batch label -> short batch name.
maprules <- c("2017_0801" = "T1", "2017_1017" = "T2", "2017_1120" = "T3")
maprules
2017_0801 2017_1017 2017_1120
"T1" "T2" "T3"
# One fixed colour per method, followed by a quick visual preview of the palette.
Methods_color <- c(
  scVI = "#E41A1C",
  CarDEC = "#377EB8",
  DCA = "#4DAF4A",
  MNN = "#984EA3",
  Raw = "#FF7F00",
  Scanorama = "#00cc99"
)
# Widen the top margin for the rotated hex labels; the old par() is restored afterwards.
op <- par(mar = c(5, 4, 6, 4))
image(
  seq_along(Methods_color), 1, as.matrix(seq_along(Methods_color)),
  col = Methods_color, xlab = "", ylab = ""
)
axis(3, at = seq_along(Methods_color), labels = Methods_color, las = 2, lwd = 0)
par(op)

# CarDEC: build obj_cardec from the 'denoised counts' layer.
adata <- ad$read_h5ad("../CarDEC Results/adata_CarDEC.h5ad")
cell.meta.data <- py_to_r(adata$obs)
cell.meta.data$dataset_batch <- plyr::mapvalues(
  cell.meta.data$batch_label,
  names(maprules),
  maprules
)
gene_ann_cardec <- py_to_r(adata$var)
mtx <- t(py_to_r(adata$layers['denoised counts']))
colnames(mtx) <- cell.meta.data$cellname
rownames(mtx) <- rownames(gene_ann_cardec)
mtx_sizefactor <- 1e4 / colSums(mtx)
obj_cardec <- CreateSeuratObject(mtx, meta.data = cell.meta.data)
Invalid name supplied, making object name syntactically valid. New object name is batch_labelcellnamedataset_batchdataset_labelstatus_labeln_genesn_countspercent_mitoBatchIDsize.factorsbatch; see ?make.names for more details on syntax validity
obj_cardec <- NormalizeData(obj_cardec, verbose = FALSE)
Idents(obj_cardec) <- "BatchID"
suppressPackageStartupMessages(library(monocle3))

# The pseudotime from cds_cardec.rds is stored unscaled (division by its
# maximum is left disabled) on both the CarDEC object and the raw object.
cds <- readRDS("./cds_cardec.rds")
obj_cardec$Pseudotime <- colData(cds)$Pseudotime
obj_raw$Pseudotime <- colData(cds)$Pseudotime

# Release the cds object before the next one is loaded.
rm(cds)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 8292993 442.9 14413331 769.8 14413331 769.8
Vcells 994980592 7591.1 1631060878 12444.1 1510425451 11523.7
# Pseudotime computed from the raw data, stored unscaled in its own column.
cds <- readRDS("./cds_raw.rds")
obj_raw$Pseudotime_raw <- colData(cds)$Pseudotime
DCA+combat
# DCA: build obj_dca from adata$X and attach the pseudotime from cds_dca.rds.
adata <- ad$read_h5ad("../final_processed_results/dca Results New/adata_all.h5ad")
cell.meta.data <- py_to_r(adata$obs)
cell.meta.data$dataset_batch <- plyr::mapvalues(
  cell.meta.data$batch_label,
  names(maprules),
  maprules
)
gene_ann0 <- py_to_r(adata$var)
gene_ann <- data.frame(
  gene_short_name = make.unique(rownames(gene_ann0)),
  row.names = make.unique(rownames(gene_ann0))
)
mtx <- t(py_to_r(adata$X))
colnames(mtx) <- cell.meta.data$cellname
rownames(mtx) <- rownames(gene_ann)
mtx_sizefactor <- 1e4 / colSums(mtx)
# NormalizeData() is deliberately not applied to this object.
obj_dca <- CreateSeuratObject(mtx, meta.data = cell.meta.data)
cds <- readRDS("./cds_dca.rds")
obj_dca$Pseudotime <- colData(cds)$Pseudotime
scVI
# scVI: build obj_scvi from adata$X.
adata <- ad$read_h5ad("../final_processed_results/scVI Results New/monocytes_ALL/adata_all.h5ad")
cell.meta.data <- py_to_r(adata$obs)
cell.meta.data$dataset_batch <- plyr::mapvalues(
  cell.meta.data$batch_label,
  names(maprules),
  maprules
)
gene_ann0 <- py_to_r(adata$var)
gene_ann <- data.frame(
  gene_short_name = make.unique(rownames(gene_ann0)),
  row.names = make.unique(rownames(gene_ann0))
)
mtx <- t(py_to_r(adata$X))
colnames(mtx) <- cell.meta.data$cellname
rownames(mtx) <- rownames(gene_ann)
mtx_sizefactor <- 1e4 / colSums(mtx)
obj_scvi <- CreateSeuratObject(mtx, meta.data = cell.meta.data)
Invalid name supplied, making object name syntactically valid. New object name is batch_labelcellnamedataset_batchdataset_labelstatus_labeln_genesn_countspercent_mitoBatchIDBatchID_encodeX_scvi_batchX_scvi_labelsX_scvi_local_l_meanX_scvi_local_l_varlouvain_denoisedlouvain_latent; see ?make.names for more details on syntax validity
# NormalizeData() is deliberately not applied to obj_scvi; the pseudotime from
# cds_scvi.rds is stored unscaled.
cds <- readRDS("./cds_scvi.rds")
obj_scvi$Pseudotime <- colData(cds)$Pseudotime
MNN
# MNN: build obj_mnn from the "corrected" assay of the saved object and attach
# the pseudotime from cds_mnn.rds (unscaled).
output <- readRDS("../final_processed_results/MNN_corrected_all.rds")
mtx <- output@assays$data$corrected
cell.meta.data <- colData(output)
cell.meta.data$dataset_batch <- plyr::mapvalues(
  cell.meta.data$batch_label,
  names(maprules),
  maprules
)
gene_ann <- data.frame(
  gene_short_name = make.unique(rownames(mtx)),
  row.names = make.unique(rownames(mtx))
)
obj_mnn <- CreateSeuratObject(mtx, meta.data = as.data.frame(cell.meta.data))
cds <- readRDS("./cds_mnn.rds")
obj_mnn$Pseudotime <- colData(cds)$Pseudotime
scanorama
# Scanorama (added for the Genome Research revision): build obj_scanorama from
# adata$raw and attach the pseudotime from cds_scanorama.rds (unscaled).
adata <- ad$read_h5ad("../final_processed_results/scanorama Results/adata_ALL.h5ad")
cell.meta.data <- py_to_r(adata$obs)
cell.meta.data$dataset_batch <- plyr::mapvalues(
  cell.meta.data$batch_label,
  names(maprules),
  maprules
)
gene_ann0 <- py_to_r(adata$raw$var)
gene_ann <- data.frame(
  gene_short_name = make.unique(rownames(gene_ann0)),
  row.names = make.unique(rownames(gene_ann0))
)
# The raw matrix is converted with tocsc() on the Python side before py_to_r().
mtx <- t(py_to_r(adata$raw$X$tocsc()))
colnames(mtx) <- cell.meta.data$cellname
rownames(mtx) <- rownames(gene_ann)
# NormalizeData() is deliberately not applied to this object.
obj_scanorama <- CreateSeuratObject(mtx, meta.data = as.data.frame(cell.meta.data))
cds <- readRDS("./cds_scanorama.rds")
obj_scanorama$Pseudotime <- colData(cds)$Pseudotime
# Gene annotation from the CarDEC object: variance type and
# log1p(FastExpMean(counts)) as the average expression.
avg_exp <- log1p(
  Seurat:::FastExpMean(obj_cardec@assays$RNA@counts, display_progress = FALSE)
)
gene_ann_all <- data.frame(
  gene_short_name = make.unique(rownames(obj_cardec)),
  VarianceType = gene_ann_cardec$`Variance Type`,
  avg_exp = avg_exp,
  row.names = make.unique(rownames(obj_cardec))
)

# TF table: keep the TFs present in obj_raw, then attach the annotation above
# and the vst.mean of the raw data.
tf_df <- openxlsx::read.xlsx("./TF_blood.xlsx") %>%
  filter(Gene %in% rownames(obj_raw)) %>%
  left_join(gene_ann_all, by = c("Gene" = "gene_short_name")) %>%
  left_join(xx_vst.mean, by = c("Gene" = "gene")) %>%
  as.data.frame()
Column `Gene`/`gene_short_name` joining character vector and factor, coercing into character vector
# Index the TF table by gene name and show it as an interactive table.
rownames(tf_df) <- tf_df$Gene
DT::datatable(tf_df)
After filtering out genes with an average expression below 0.05, we have
# Keep only the TFs whose avg_exp is at least 0.05, then show the reduced table.
tf_df <- subset(tf_df, tf_df$avg_exp >= 0.05)
DT::datatable(tf_df)
# Build the per-cell data frame used to model gene expression against pseudotime.
#
# obj:  Seurat object whose meta data holds `Pseudotime` and `batch_label`.
# gene: feature name(s) handed to FetchData().
#
# Returns the FetchData() result with these extra columns:
#   pseudotime    - obj$Pseudotime, unscaled
#   Pseudotime_01 - pseudotime divided by its largest finite value (for plots)
#   BatchID       - batch_label recoded through the global `maprules`
#   x             - same scaling as Pseudotime_01, computed on the kept cells
# Cells with a non-finite pseudotime (NA, NaN, Inf) are dropped and the rows are
# sorted by increasing pseudotime.
get_raw_denoised_df_new=function(obj,gene){
  df0=FetchData(object = obj,vars=gene)
  df0$pseudotime=obj$Pseudotime
  # Scale by the largest *finite* pseudotime. Taking max() over all values lets
  # a single Inf turn Pseudotime_01 into 0 for every cell that is kept below.
  finite_max=max(df0$pseudotime[is.finite(df0$pseudotime)])
  df0$Pseudotime_01=df0$pseudotime/finite_max
  df0$BatchID=plyr::mapvalues(obj$batch_label,names(maprules),maprules)
  df0=df0[is.finite(df0$pseudotime),,drop=FALSE]
  df0=df0[order(df0$pseudotime,decreasing = FALSE),,drop=FALSE]
  df0$x=df0$pseudotime/max(df0$pseudotime)
  return(df0)
}
suppressPackageStartupMessages(library(mgcv))
# Test whether a gene's expression trend along pseudotime differs between batches.
#
# Two models are fitted with gam(): gene ~ s(pseudotime) and
# gene ~ BatchID + s(pseudotime). They are compared with anova() using both the
# Chisq and the F test. This is done once on all rows ("Overall") and once more
# after leaving out each batch in turn; with batches T1, T2 and T3 the
# leave-one-out fits are the three pairwise comparisons.
#
# df00: data frame with the columns `pseudotime`, `BatchID` and one column
#       named `gene`.
# gene: name of the response column; "-" and "." are stripped from it before it
#       is pasted into the model formula.
# return_pval_table: if TRUE (default) a one-row data frame of p-values is
#       returned; the column layout assumes the batches are named T1, T2, T3.
#       Otherwise a list with `single` (overall fit) and `pariwise`
#       (leave-one-batch-out fits, named by the batch left out) is returned.
#       The spelling `pariwise` is kept because callers may rely on it.
#
# Stops with an explicit message when a required column is missing.
get_pval_pair_new=function(df00,gene,return_pval_table=TRUE){
  missing_cols=setdiff(c(gene,"pseudotime","BatchID"),colnames(df00))
  if(length(missing_cols)>0){
    stop("df00 is missing required column(s): ",
         paste(missing_cols,collapse = ", "),call. = FALSE)
  }
  df00_tmp=df00
  gene_new=gsub("-|\\.","",gene)
  colnames(df00_tmp)[colnames(df00_tmp)==gene]=gene_new
  formula1=as.formula(paste0(gene_new,"~","s(pseudotime,bs=\"cs\")"))
  formula2=as.formula(paste0(gene_new,"~BatchID+","s(pseudotime,bs=\"cs\")"))
  # Fit the model without and with BatchID on `dat` and compare the two fits.
  compare_models=function(dat){
    mod1 <- gam(formula1, data = dat, select = FALSE)
    mod2 <- gam(formula2, data = dat, select = FALSE)
    list(mod1=mod1,mod2=mod2,
         m1=anova(mod1, mod2, test = "Chisq"),
         m2=anova(mod1, mod2, test = "F"))
  }
  overall=compare_models(df00_tmp)
  unique_batch=sort(as.character(unique(df00_tmp$BatchID)),decreasing = TRUE)
  res_pairwise=lapply(unique_batch,function(x){
    compare_models(df00_tmp[df00_tmp$BatchID!=x,,drop=FALSE])
  })
  names(res_pairwise)=unique_batch
  if(!return_pval_table){
    return(list(single=overall,pariwise=res_pairwise))
  }
  col_names=c("gene",paste0(rep(c("T1 v.s. T2", "T1 v.s. T3","T2 v.s. T3"),times=2),
                            rep(c(" (Chisq)"," (F)"),each=3)),
              "Overall (Chisq)", "Overall (F)"
  )
  res0=data.frame(matrix(NA,ncol=length(col_names),nrow=1))
  colnames(res0)=col_names
  res0[1,1]=gene
  # Leaving out T3 compares T1 with T2, leaving out T2 compares T1 with T3, and so on.
  res0[1,2]=res_pairwise$T3$m1$`Pr(>Chi)`[2]
  res0[1,3]=res_pairwise$T2$m1$`Pr(>Chi)`[2]
  res0[1,4]=res_pairwise$T1$m1$`Pr(>Chi)`[2]
  res0[1,5]=res_pairwise$T3$m2$`Pr(>F)`[2]
  res0[1,6]=res_pairwise$T2$m2$`Pr(>F)`[2]
  res0[1,7]=res_pairwise$T1$m2$`Pr(>F)`[2]
  res0[1,8]=overall$m1$`Pr(>Chi)`[2]
  res0[1,9]=overall$m2$`Pr(>F)`[2]
  return(res0)
}
# Load the saved p-value results from disk.
res_pval_list <- readRDS("./res_pval_new_revised.rds")
Final Plots
In this section, we report the pseudotime identified by each of the compared methods, including MNN, scVI, DCA and Scanorama.
Feature plots
# Default ggplot theme for the feature plots: theme_bw() with white strips and
# no panel background, legend background or grid. The old theme is kept in `old`.
old <- theme_set(
  theme_bw() +
    theme(
      strip.background = element_rect(fill = "white"),
      panel.background = element_blank(),
      legend.background = element_blank(),
      panel.grid = element_blank()
    )
)
# "Overall (Chisq)" p-values of every method except Raw.
df0_tmp0 <- subset(df_plot, variable == "Overall (Chisq)" & !Group %in% "Raw")
# Method name -> name of the corresponding object.
ggtitle0 <- c(
  CarDEC = "obj_cardec",
  DCA = "obj_dca",
  MNN = "obj_mnn",
  Raw = "obj_raw",
  scVI = "obj_scvi",
  Scanorama = "obj_scanorama"
)
# Scatter plot of one gene against pseudotime with GAM smoothers.
#
# df00:   data frame with the columns `pseudotime`, `BatchID` and one column
#         named `gene`.
# gene:   gene to plot; also looked up in the global `tf_df` to label and colour
#         the y-axis title (red for HVG, blue otherwise).
# title0: method name used as plot title and to look up the p-value in the
#         global `df0_tmp0` (shown as "NA" when no p-value is found).
#
# Returns a ggplot object with one smoother per batch plus a black smoother
# over all cells.
get_plot1=function(df00,gene="S100A8",title0="CarDEC"){
  p_val_label=df0_tmp0$pval[df0_tmp0$gene==gene&df0_tmp0$Group==title0]
  variance_type=as.character(tf_df$VarianceType[tf_df$Gene==gene])
  # aes_string() parses its arguments as R code, so a gene such as "HLA-DRA"
  # would be read as a subtraction. Back-quoting makes it a plain column name.
  gene_aes=paste0("`",gene,"`")
  p=ggplot(data =df00,aes_string(x="pseudotime",y=gene_aes))+
    geom_point(aes(color=BatchID),size=0.01)+
    guides(color=guide_legend(override.aes = list(size=5)))+
    geom_smooth(aes(color=BatchID),method="gam",formula = y ~ s(x, bs="cs"))+
    geom_smooth(color="black",method="gam",formula = y ~ s(x, bs="cs"),size=0.5)+
    ylab(paste0(gene," (",variance_type,")"))+
    ggtitle(title0,subtitle = paste0("p-value=",ifelse(length(p_val_label)!=0,scales::scientific(p_val_label),"NA")))+
    xlab("Pseudotime")+theme(legend.position = "right",
                             plot.title = element_text(size=18,face="bold",hjust=0.5),
                             legend.text = element_text(size=15,face="bold"),
                             axis.title.y = element_text(color=ifelse(variance_type=="HVG","red","blue")),
                             legend.title = element_blank())+scale_color_brewer(palette = "Set2")
  return(p)
}
# For every TF (alphabetical order) build five panels in the order
# CarDEC, Scanorama, DCA, scVI, MNN; only the MNN panel keeps its legend.
plist_0 <- lapply(sort(tf_df$Gene), function(gene0) {
  hide_legend <- theme(legend.position = "none")
  cardec <- get_plot1(df0_list$obj_cardec, gene = gene0, title0 = "CarDEC") + hide_legend
  scanorama <- get_plot1(df0_list$obj_scanorama, gene = gene0, title0 = "Scanorama") + hide_legend
  dca <- get_plot1(df0_list$obj_dca, gene = gene0, title0 = "DCA") + hide_legend
  scvi <- get_plot1(df0_list$obj_scvi, gene = gene0, title0 = "scVI") + hide_legend
  mnn <- get_plot1(df0_list$obj_mnn, gene = gene0, title0 = "MNN")
  list(cardec, scanorama, dca, scvi, mnn)
})
# One single-row arrangement per gene.
plist_1 <- lapply(plist_0, function(panels) {
  egg::ggarrange(plots = panels, nrow = 1, draw = FALSE)
})
# Stack the rows of the four selected genes.
id0 <- c(23, 22, 25, 19)
plot_grid(plotlist = plist_1[c(23, 22, 25, 19)], align = "v", ncol = 1)

# Same panels for the four selected genes, without plot titles. The CarDEC
# panel also gets its own plot margin; only the MNN panel keeps its legend.
plist_00 <- lapply(sort(tf_df$Gene)[c(23, 22, 25, 19)], function(gene0) {
  hide_legend <- theme(legend.position = "none")
  hide_title <- theme(plot.title = element_blank())
  cardec <- get_plot1(df0_list$obj_cardec, gene = gene0, title0 = "CarDEC") +
    hide_legend +
    theme(plot.title = element_blank(), plot.margin = unit(c(0.2, 0, 0.5, 0.2), "cm"))
  scanorama <- get_plot1(df0_list$obj_scanorama, gene = gene0, title0 = "Scanorama") +
    hide_legend + hide_title
  dca <- get_plot1(df0_list$obj_dca, gene = gene0, title0 = "DCA") +
    hide_legend + hide_title
  scvi <- get_plot1(df0_list$obj_scvi, gene = gene0, title0 = "scVI") +
    hide_legend + hide_title
  mnn <- get_plot1(df0_list$obj_mnn, gene = gene0, title0 = "MNN") + hide_title
  list(cardec, scanorama, dca, scvi, mnn)
})
# Flatten the four rows into one list of 20 panels laid out in five columns.
pp0 <- egg::ggarrange(plots = do.call(c, plist_00), ncol = 5, draw = FALSE)
Heatmap plots
suppressPackageStartupMessages(library(cowplot))
# Read the two saved heatmap PDFs back in as images.
p.denoised <- ggdraw() +
  draw_image(magick::image_read_pdf("denoised_heatmap_nosmooth.pdf"), scale = 1)
p.raw <- ggdraw() +
  draw_image(magick::image_read_pdf("./raw_pseudotime_heatmap_nosmooth.pdf"), scale = 1)
# Place them side by side, each half-width, with a caption above each half.
p_heatmap <- ggdraw() +
  draw_plot(p.denoised, x = 0, y = 0, width = 0.5, height = 0.98) +
  draw_plot(p.raw, x = 0.5, y = 0, width = 0.5, height = 0.98) +
  draw_label(
    "Denoised gene expression (pseudotime by CarDEC)",
    x = 0.25, y = 1, hjust = 0.5, vjust = 1, size = 20
  ) +
  draw_label(
    "Raw gene expression (pseudotime by Raw)",
    x = 0.75, y = 1, hjust = 0.5, vjust = 1, size = 20
  )
p_heatmap

boxplot of pval
# Box plot of pval_log10 per method with jittered points and a dotted red
# reference line at y = 2. `dat` must have the columns Group and pval_log10.
pval_boxplot <- function(dat, plot_title) {
  ggplot(dat, aes(x = Group, y = pval_log10, fill = Group)) +
    geom_boxplot(
      width = 0.5, color = "blue", outlier.color = NA, size = 0.2,
      position = position_dodge(0.6)
    ) +
    geom_jitter(
      alpha = 0.4, size = 0.4,
      position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.6)
    ) +
    theme_cowplot() +
    geom_hline(yintercept = 2, color = "red", lty = 3) +
    theme(axis.text.x = element_text(angle = 20, hjust = 1)) +
    ylab(expression(paste("-",log[10],"(",p,".",value,")"))) +
    scale_fill_manual(values = Methods_color) +
    xlab("") +
    ggtitle(plot_title) +
    theme(
      legend.title = element_blank(),
      legend.position = "none",
      plot.title = element_text(hjust = 0.5)
    )
}

# HVGs: "Overall (Chisq)" p-values of every method except Raw.
df0_tmp <- subset(df_plot, genetype == "HVG" & variable == "Overall (Chisq)" & !Group %in% "Raw")
df0_tmp$Group <- factor(df0_tmp$Group, levels = c("CarDEC", "Scanorama", "DCA", "scVI", "MNN"))
p1 <- pval_boxplot(df0_tmp, "HVGs (23 genes)")

# LVGs: same selection for the low-variance genes.
df0_tmp <- subset(df_plot, genetype == "LVG" & variable == "Overall (Chisq)" & !Group %in% "Raw")
df0_tmp$Group <- factor(df0_tmp$Group, levels = c("CarDEC", "Scanorama", "DCA", "scVI", "MNN"))
p2 <- pval_boxplot(df0_tmp, "LVGs (38 genes)")

- The median value for each method in the above figure
# Median, mean and count of pval_log10 per gene type and method (Raw excluded).
subset(df_plot, variable == "Overall (Chisq)" & !Group %in% "Raw") %>%
  group_by(genetype, Group) %>%
  summarise(
    log_pval_median = median(pval_log10),
    logp_mean = mean(pval_log10),
    n = n()
  )
Figure 6
# Figure 6 layout. All positions are fractions of an 18 x 20 canvas:
# heatmaps on top (a, b), box plots bottom left (c), feature plots bottom right (d).
width <- 18
height <- 20
p6 <- ggdraw() +
  draw_plot(p_heatmap, x = 0, y = 1 - 8 / height, height = 8 / height, width = 1) +
  draw_plot(p_box, x = 0, y = 0.1 / height, height = 11.8 / height, width = 4.5 / width) +
  draw_plot(pp0, x = 4.5 / width, y = 0, height = 11.7 / height, width = 13.5 / width) +
  # panel letters
  draw_label("a", x = 0, y = 1, hjust = 0, vjust = 1, size = 30) +
  draw_label("b", x = 0.5, y = 1, hjust = 0, vjust = 1, size = 30) +
  draw_label("c", x = 0, y = 12 / height, hjust = 0, vjust = 1, size = 30) +
  draw_label("d", x = 4.5 / width, y = 12 / height, hjust = 0, vjust = 1, size = 30) +
  # method names above the five feature-plot columns
  draw_label(
    "CarDEC", x = (4.5 + 13.5 / 10 * 1 + 0.2) / width, y = 11.9 / height,
    hjust = 0.5, vjust = 1, size = 18, fontface = "bold"
  ) +
  draw_label(
    "Scanorama", x = (4.5 + 13.5 / 10 * 3 - 0.1) / width, y = 11.9 / height,
    hjust = 0.5, vjust = 1, size = 18, fontface = "bold"
  ) +
  draw_label(
    "DCA", x = (4.5 + 13.5 / 10 * 5 - 0.27) / width, y = 11.9 / height,
    hjust = 0.5, vjust = 1, size = 18, fontface = "bold"
  ) +
  draw_label(
    "scVI", x = (4.5 + 13.5 / 10 * 7 - 0.5) / width, y = 11.9 / height,
    hjust = 0.5, vjust = 1, size = 18, fontface = "bold"
  ) +
  draw_label(
    "MNN", x = (4.5 + 13.5 / 10 * 9 - 0.68) / width, y = 11.9 / height,
    hjust = 0.5, vjust = 1, size = 18, fontface = "bold"
  )
p6

---
title: "The results for monocytes dataset"
subtitle: "raw gene expression matrix, CarDEC, DCA, scVI and Scanorama"
author:  Xiangjie Li
date: "`r format(Sys.time(), '%m/%d/%Y')`"
output:
  html_notebook:
    number_sections: yes
    toc: yes
  jekyllthat::jekylldown:
  html_document:
    df_print: paged
    toc: yes
    number_sections: yes
  prettydoc::html_pretty:
    theme: cayman
    highlight: github
    math: katex
    toc: yes
---

<style>
pre {
  max-height: 200px;
  float: left;
  width: 910px;
  overflow-y: auto;
}
pre.r {
  max-height: none;
}
</style>

- Data Summary 

This dataset was generated by our group and can be downloaded from [GSE146974](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE146974) or [https://drive.google.com/file/d/1kR8Hhufoo2h2OtomW8n3kM0gaQhVS564/view?usp=sharing](https://drive.google.com/file/d/1kR8Hhufoo2h2OtomW8n3kM0gaQhVS564/view?usp=sharing). The dataset was generated from human peripheral blood mononuclear cells by Ficoll separation followed by CD14- and CD16-positive cell selection. Since the CD14 and CD16 antibodies are not 100% specific, some T cells were also present in the scRNA-seq data. We performed clustering analysis with the Leiden algorithm for each batch and identified 288 T cells in total based on the T cell marker genes CD3D, CD3E and CD3G. After removing these 288 T cells, 10,878 cells and 21,289 genes remained for the analysis. The cells were processed and sequenced on three different days, resulting in three batches (3,640 cells in T1, 4,833 cells in T2 and 2,405 cells in T3). 

__***Human monocyte preparation***__: Monocyte preparation uses a modification of published protocols. Briefly, ~20 ml blood drawn in sodium heparin was processed immediately in the lab in the Clinical Research Center at Columbia University. PBMCs were isolated by gradient Ficoll paque centrifugation, which maintains cell viability and prevents ex vivo activation during cell recovery. Cells were stained with antibodies against human HLADR, CD14 and CD16 and monocyte subsets defined as HLADR+CD14++CD16-(classical), HLADR+CD14++CD16+ (intermediate), HLADR+CD14dim/CD16++ (nonclassical, patrolling monocyte). DAPI staining was used to exclude dead cells. Monocytes were sorted by a BD Influx Sorter into tubes for real-time 10x Genomics analysis.


```{r}
# Session setup: library paths, Python environment for reticulate, and packages.
options(warn=-1) # turn off warning message globally
.libPaths(c("/home/xiaoxiang/R/x86_64-pc-linux-gnu-library/3.5",.libPaths()))
Sys.setenv(RETICULATE_PYTHON_ENV="/home/xiaoxiang/anaconda3/envs/cardec")#="/home/xiaoxiang/.conda/envs/DESCVIR"

#Sys.setenv(RETICULATE_PYTHON="/usr/bin/python3")
#RETICULATE_PYTHON="/home/xiaoxiang/anaconda3/bin/python3",
# Drop an already loaded Seurat before the notebook attaches its own copy.
# detach() only accepts packages that are on the search path, so a Seurat that
# is loaded as a namespace but not attached is unloaded instead of detached.
if ("package:Seurat" %in% search()) {
  detach("package:Seurat",unload = TRUE)
} else if ("Seurat" %in% loadedNamespaces()) {
  # best effort: unloading fails while another loaded namespace imports Seurat
  try(unloadNamespace("Seurat"),silent = TRUE)
}
dyn.load("/home/xiaoxiang/R/x86_64-pc-linux-gnu-library/3.5/sf/libs/sf.so")
#suppressPackageStartupMessages(library(monocle,lib.loc = "/usr/lib/R/monocle_alpha"))# devtools::install_github("")
#devtools::install_github("cole-trapnell-lab/DDRTree", ref="simple-ppt-like",lib="/usr/lib/R/monocle_alpha")
#devtools::install_github("r-spatial/sf") if 
#install.packages("~/Downloads/monocle-release-monocle3_alpha/", repos = NULL,lib = "/usr/lib/R/monocle_alpha")
suppressPackageStartupMessages(library(reticulate))
use_condaenv("cardec")

#suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(monocle))

#suppressPackageStartupMessages(library(flexclust))
#suppressPackageStartupMessages(library(mcclust))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggjoy))
suppressPackageStartupMessages(library(VGAM))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(cowplot))
#py_install('umap-learn', pip = T, pip_ignore_installed = T)
#import("leiden")
#fig_path="/home/xiaoxiang/Documents/DESC_paper_prepare/DESC_paper_final/formal_revised/figures_sep/"
datadirpath="./"
# echo the code of every chunk in the rendered document
knitr::opts_chunk$set(echo=TRUE)
```

```{r}
if(R.version$os=="linux-gnu"){
  #in my ubuntu computer
  suppressPackageStartupMessages(library("Seurat",lib.loc = "/usr/lib/R/self_library/"))
  suppressPackageStartupMessages(library("SeuratWrappers",lib.loc = "/usr/lib/R/self_library/"))
}else{
  #in my macbook
  suppressPackageStartupMessages(library(Seurat))
  suppressPackageStartupMessages(library(SeuratWrappers))
}
```


```{r}
# load necessay function
#source("/media/xiaoxiang/D/DESC_reproducible_file/helpfunc_new.R")
#source("/media/xiaoxiang/D/Upenn_computer_backup/Documents/Human_Heart_Project/heart/Heart_result_updated/helpfunc_new.R")
# Set the default ggplot2 theme for all later plots: black-and-white base,
# white facet strips, no panel/legend background and no grid lines.
# `old` holds the value returned by theme_set().
old=theme_set(theme_bw()+theme(strip.background = element_rect(fill="white"),
                                         panel.background = element_blank(),
                               legend.background = element_blank(),
                                         panel.grid =element_blank()))

BatchKL=function(df,dimensionData=NULL,replicates=200,n_neighbors=100,n_cells=100,batch="BatchID"){
  # Bootstrap estimate of batch mixing in a low-dimensional embedding.
  #
  # For each replicate, a random set of cells is drawn; for every drawn cell the
  # batch composition of its nearest neighbours is compared with the global
  # batch composition through a KL divergence (log base 2). The replicate value
  # is the mean divergence over the drawn cells.
  #
  # Arguments:
  #   df            data frame with one row per cell; must contain the column
  #                 named by `batch`, and "tSNE_1"/"tSNE_2" when `dimensionData`
  #                 is NULL.
  #   dimensionData optional embedding (one row per cell) used instead of the
  #                 tSNE columns of `df`.
  #   replicates    number of bootstrap replicates.
  #   n_neighbors   upper bound on the number of nearest neighbours; the number
  #                 actually used is min(5 * number of batches, n_neighbors).
  #   n_cells       number of cells drawn per replicate (capped at the number
  #                 of available cells).
  #   batch         name of the batch column in `df`.
  #
  # Returns a numeric vector of length `replicates`.
  if (is.null(dimensionData)){
    embedding=as.matrix(df[,c("tSNE_1","tSNE_2")])
  }else{
    embedding=as.matrix(dimensionData)
  }
  batchdata=factor(as.vector(df[,batch]))
  batch_counts=as.matrix(table(batchdata))[,1]
  global_prop=batch_counts/sum(batch_counts) # global batch proportions
  n=dim(df)[1]
  # Never ask for more cells or neighbours than exist.
  n_draw=min(n_cells,n)
  k_use=min(5*length(global_prop),n_neighbors,n)
  KL=vapply(seq_len(replicates),function(x){
    bootsamples=sample.int(n,n_draw)
    # drop=FALSE keeps the query a matrix even when a single cell is drawn.
    nearest=nabor::knn(embedding,embedding[bootsamples,,drop=FALSE],k=k_use)
    KL_x=vapply(seq_along(bootsamples),function(y){
      id=nearest$nn.idx[y,]
      # batchdata is a factor, so the table has one entry per batch level and
      # lines up with global_prop.
      local_prop=as.matrix(table(batchdata[id]))[,1]
      local_prop=local_prop/sum(local_prop)
      # Batches absent from the neighbourhood give 0*log2(0)=NaN, which
      # na.rm=TRUE drops (i.e. they contribute 0).
      sum(local_prop*log2(local_prop/global_prop),na.rm = TRUE)
    },numeric(1))
    mean(KL_x,na.rm = TRUE)
  },numeric(1))
  return(KL)
}
```

```{r}
Convert_to_seurat3=function(adata){
  # Build a Seurat object from a (Python) AnnData object.
  # The transposed `X` matrix is converted to R, labelled with the row names of
  # `var` and the `cellname` column of `obs`, and all `obs` columns except
  # "n_genes" and "n_counts" are passed on as cell metadata.
  suppressPackageStartupMessages(library("Seurat",lib.loc = "/usr/lib/R/self_library/"))
  counts=py_to_r(adata$X$T$tocsc())
  obs_df=py_to_r(adata$obs)
  var_df=py_to_r(adata$var)
  colnames(counts)=obs_df$cellname
  rownames(counts)=rownames(var_df)
  is_qc_col=colnames(obs_df)%in%c("n_genes","n_counts")
  meta=obs_df[,!is_qc_col,drop=FALSE]
  CreateSeuratObject(counts,meta.data = meta,min.features = 1)
}
getwd()
```

# reading raw data 

```{r}
# Import the Python `anndata` module through reticulate with convert = FALSE;
# results are converted to R explicitly with py_to_r() where needed.
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad("../../dca_test.h5ad")#dca_test.h5ad is the monocyte data used for CarDEC
# obj0 is the Seurat object built from this raw (pre-denoising) data.
obj0=Convert_to_seurat3(adata)
obj0=NormalizeData(obj0,verbose = F)
# Keep the raw count matrix under its own name.
raw.data=obj0@assays$RNA@counts
```

```{r}
maprules=c("2017_0801"="T1","2017_1017"="T2","2017_1120"="T3")
maprules
```

# reading processed data by CarDEC

```{r}
adata=ad$read_h5ad("../CarDEC Results/adata_CarDEC.h5ad")
```


```{r}
# Cell-level metadata (adata.obs) converted to R.
cell.meta.data=py_to_r(adata$obs)
# Map the batch labels to the short names defined in `maprules`.
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
# Gene-level metadata (adata.var); its row names are used as gene names below.
gene_ann0=py_to_r(adata$var)

# 'denoised counts' layer, transposed; columns are then named by cell and
# rows by gene.
mtx=t(py_to_r(adata$layers['denoised counts']))
colnames(mtx)=cell.meta.data$cellname
rownames(mtx)=rownames(gene_ann0)
# Per-cell factor 1e4 / total count. NOTE(review): not referenced again in the
# code shown here.
mtx_sizefactor=1e4/colSums(mtx)
```

```{r}
# Seurat object built from the CarDEC denoised matrix plus the cell metadata.
obj1=CreateSeuratObject(mtx,meta.data = cell.meta.data)
obj1=NormalizeData(obj1,verbose = F)
# Use "BatchID" as the identity of the cells.
Idents(obj1)="BatchID" #obj1 means denoised count by CarDEC
```

```{r}
# Per-gene average of the denoised counts, computed with Seurat's internal
# (non-exported) FastExpMean helper.
avg_exp=Seurat:::FastExpMean(obj1@assays$RNA@counts,display_progress = F)#log1p(
# Gene annotation table keyed by de-duplicated gene name: the `Variance Type`
# column from the CarDEC output plus the average expression computed above.
gene_ann_all=data.frame(gene_short_name = make.unique(rownames(gene_ann0)),
                    VarianceType=gene_ann0$`Variance Type`,
                    avg_exp=avg_exp,
                    row.names = make.unique(rownames(gene_ann0)))
```



```{r}
#assign pseudotime for obj0
cds=readRDS("./cds_cardec.rds") #csd_cardec is the monocle's output from CarDEC
# NOTE(review): xtmp is not referenced again in the code shown here.
xtmp=colData(cds)[,c("Pseudotime")]%>%as.data.frame()
# The pseudotime vector is copied as-is into both objects, with no check that
# the cells of `cds` are in the same order as those of obj0/obj1 — TODO confirm.
obj0$Pseudotime=colData(cds)$Pseudotime
obj1$Pseudotime=colData(cds)$Pseudotime
```


```{r}
# Compute variable-feature statistics on the raw data and keep the per-gene
# `vst.mean` column, with the gene name as an explicit column for joining.
obj0=NormalizeData(obj0,verbose = F)
obj0=FindVariableFeatures(obj0,verbose = F)
xx=obj0@assays$RNA@meta.features[,c("vst.mean"),drop=F]
xx$gene=rownames(xx)
```

We obtained TFs related to monocytes from Figure 4 in [Gene expression profiling reveals the defining features of the classical, intermediate, and nonclassical human monocyte subsets](https://ashpublications.org/blood/article/118/5/e16/29016/Gene-expression-profiling-reveals-the-defining) 

```{r}
# Read the TF list, keep the TFs present in obj0, then attach by gene name the
# CarDEC annotation (VarianceType, avg_exp) and the raw-data vst.mean.
tf_df=openxlsx::read.xlsx("./TF_blood.xlsx")%>%filter(Gene%in%rownames(obj0))%>%
  left_join(gene_ann_all,by=c("Gene"="gene_short_name"))%>%
  left_join(xx,by=c("Gene"="gene"))%>%
  as.data.frame()
# Row names = gene names, so later code can index tf_df by gene.
rownames(tf_df)=tf_df$Gene
tf_df
```


```{r}
# For each TF, the number of cells with a non-zero raw count.
num_cells_expressed=Matrix::rowSums(obj0@assays$RNA@counts[tf_df$Gene,]!=0)
tf_df$num_cells_expressed=num_cells_expressed[rownames(tf_df)]
```


# Heatmap-no smoothed

**Note:** gene names colored <span style="color:#377EB8">blue</span> are LVGs and gene names colored <span style="color:#E41A1C">red</span> are HVGs. We first filtered out genes with average expression lower than 0.1, then sorted the remaining genes in decreasing order of average expression within each variance type (HVG/LVG) and highlighted the top 20 genes of each group. (Genes were also clustered into 3 modules, which are not shown here.)

## pseudotime from cardec's denoised gene expression

```{r}
# Variable-feature statistics on the denoised data, ordered by decreasing
# standardized variance (gene22).
obj1=NormalizeData(obj1,verbose = F)
obj1=FindVariableFeatures(obj1,verbose = F)
hvf.info=obj1@assays$RNA@meta.features
gene22=hvf.info[order(hvf.info$vst.variance.standardized,  decreasing = TRUE), ,drop = FALSE]
# Number of cells with a non-zero denoised count for each TF
# (overwrites the raw-data vector of the same name).
num_cells_expressed=Matrix::rowSums(obj1@assays$RNA@counts[tf_df$Gene,]!=0)
# Scale only the TF genes; the heatmap below reads them from "scale.data".
obj1=ScaleData(obj1,features = tf_df$Gene,verbose = F)
```



```{r}
#scaled
#m = t(scale(m1,center = T))
# Genes x cells matrix of scaled TF expression from obj1.
m=t(FetchData(obj1,vars = tf_df$Gene,slot = "scale.data"))
# Replace NaN by 0 and clip the values to [-3, 3].
m[is.nan(m)] = 0
m[m >= 3] = 3
m[m <= -3] = -3
heatmap_matrix <- m
# Gene-gene distance (1 - correlation) / 2.
row_dist <- as.dist((1 - cor(t(heatmap_matrix)))/2)
# pheatmap is used only for its row clustering (silent, no output file); the
# result is kept together with the matrix it was computed on.
res2 <- list(ph=pheatmap::pheatmap(heatmap_matrix, useRaster = T, cluster_cols = FALSE, 
        cluster_rows = T, show_rownames = F, show_colnames = F, 
        clustering_distance_rows = row_dist, clustering_method =  "ward.D2", 
        cutree_rows = 3, silent = TRUE, filename = NA),m=heatmap_matrix)
 
```


```{r}
# Reorder the genes (rows) by the dendrogram order and cut the tree into 3 clusters.
z=res2[[2]][res2[[1]]$tree_row$order,]
Cluster_labels=cutree(res2[[1]]$tree_row,3)[res2[[1]]$tree_row$order]
# Per-gene annotation table, in the same row order as z.
# NOTE(review): "GeneType" is not created in the code shown here — presumably a
# column of TF_blood.xlsx; confirm.
row_anno=data.frame(genename=rownames(z),
                    Cluster=Cluster_labels,
                    Class=tf_df[rownames(z),"Gene"],
                    VarianceType=tf_df[rownames(z),"VarianceType"],
                    GeneType=tf_df[rownames(z),"GeneType"],
                    #avg_exp=tf_df[rownames(z),"avg_exp"],
                    vst.mean=gene22[rownames(z),"vst.mean"],
                    vst.variance.standard=gene22[rownames(z),"vst.variance.standardized"],
                    num_cells_expressed=num_cells_expressed[rownames(z)],
                    row.names = rownames(z),stringsAsFactors = F)
# Rename the cluster ids to "Module1", "Module2", ...
row_anno$GeneClass=plyr::mapvalues(row_anno$Cluster,from=sort(unique(row_anno$Cluster)),
                                   to=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels))))))

#order_id0=order(row_anno$Cluster,row_anno$GeneType,row_anno$VarianceType)
#z=z[order_id0,]
#row_anno=row_anno[order_id0,]
#GeneClass.color=gg_color_hue(length(unique(Cluster_labels)))
# One "Dark2" color per module.
GeneClass.color=RColorBrewer::brewer.pal(3,"Dark2")
names(GeneClass.color)=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels)))))
# gene Module color 
row_anno$color=plyr::mapvalues(row_anno$GeneClass,from=names(GeneClass.color),
                                   to=GeneClass.color)
# Label color per gene: red for HVG, blue for LVG.
row_anno$HVG_color=plyr::mapvalues(as.character(row_anno$VarianceType),from=c("HVG","LVG"),
                                   to=c("#E41A1C","#377EB8"))
```

```{r}
# Build heatmap p1: scaled TF expression from the denoised data, with cells
# (columns) ordered by the pseudotime stored in obj1.
rownames(row_anno)=row_anno$genename
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# columan annotation
#col_anno=data.frame(pseudotime=seq(0,1,length=ncol(z)),
#                    row.names = as.character(1:ncol(z)))
col_anno=data.frame(BatchID=plyr::mapvalues(obj1$BatchID,c("MH001","RP002","RP009"),c("T1","T2","T3")),
                    Pseudotime=obj1$Pseudotime,
                    row.names = colnames(obj1))
# Sort columns (cells) by increasing pseudotime; the same permutation is
# applied to the matrix and to its annotation.
col_id=order(col_anno$Pseudotime)
z=z[,col_id]
col_anno=col_anno[col_id,]
# 
set.seed(10)

# Genes to label: keep genes with vst.mean > 0.1, rank them by decreasing
# vst.mean within each VarianceType, and take ranks 1..20 of each group.
row_anno.tmp=row_anno%>%
  filter(vst.mean>0.1)%>%
  group_by(VarianceType)%>%
  arrange(desc(vst.mean),.by_group = TRUE)%>%
  mutate(n_order=1:n())%>%as.data.frame()

#id.select=order(row_anno$qval,decreasing = F)[1:30]
# Row positions (in heatmap order) of the genes to label.
id.select=which(row_anno$genename%in%row_anno.tmp$genename[row_anno.tmp$n_order%in%c(1:20)])
row_anno.new=row_anno[id.select,]
# Blue-white-red color scale over the clipped range [-3, 3].
col_fun = circlize::colorRamp2(seq(-3, 3,length=200), colorRampPalette(c("blue", "white", "red"))(200))
#pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(c("darkblue","black","#FFFF00"))(150))
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# Column annotation: pseudotime color bar and batch.
top_anno=HeatmapAnnotation( Pseudotime = anno_simple(col_anno$Pseudotime, col = pseudotime_col_fun),
                            BatchID=col_anno$BatchID,
                             col=list(BatchID=c("T1"="#66C2A5","T2"="#FC8D62","T3"="#8DA0CB")),
                          show_legend = c(T,T),
                          annotation_name_side = "right")

# Row annotation: only VarianceType is active. NOTE(review): `col` still lists a
# GeneClass palette although the GeneClass annotation is commented out.
left_anno=rowAnnotation(#GeneClass=row_anno$GeneClass,
                        #GeneType=row_anno$GeneType,
                         VarianceType=row_anno$VarianceType,
                      #Nonzero=anno_barplot(log10(row_anno$num_cells_expressed),
                      #                              gp  = gpar(fill ="#00C4FF",col=NA),
                      #                              bar_width = 1,
                      #                              height = unit(2.5, "cm")),
                      
                        annotation_name_rot=90,
                      annotation_name_side = "top",
                        #col=list(GeneClass=celltype_color[names(celltype_color)%in%unique(tmp.anno$celltype)]),
                        col=list(GeneClass=GeneClass.color,
                                 VarianceType=c("HVG"="#E41A1C","LVG"="#377EB8")),
                        show_annotation_name=c(F,T),
                        show_legend=c(T,T))
# No clustering inside Heatmap(): rows and columns keep the order set above.
# Row/column labels are blank; the selected genes are labelled through
# anno_mark() on the right, colored by HVG/LVG.
p1=ComplexHeatmap::Heatmap(z, name = "scaled.expression", 
                      cluster_rows = F,
                      cluster_columns = F,
                      col=col_fun,
                      column_labels =  rep("",length=ncol(z)),
                      row_labels = rep("",length=nrow(z)),
                      #row_labels = gene.use.df$gene,
                      row_names_side = "right",
                      #the following fontsize is nonusefull for this situattion
                      row_names_gp = gpar(fontsize=5),
                      column_names_gp = gpar(fontsize = 8),
                      #column_gap = unit(0.5, "mm"),
                      #column_split = tmp.anno$res,
                      #column_title = "%s",
                      #column_title_rot=90,
                      #row_gap = unit(0.5, "mm"),
                      #row_split = row_anno$GeneClass,
                      #row_title = "%s",
                      row_title_gp = gpar(fontsize = 12),
                      row_title_rot = 90,
                      top_annotation = top_anno,
                      left_annotation= left_anno,
                       heatmap_legend_param=list(legend_direction="horizontal",legend_width = unit(2.5, "cm")),
                      right_annotation = rowAnnotation(foo=anno_mark(at=id.select,
                                                                     side="right",
                                                                     extend=unit(0.2,"cm"),
                                                                     labels_gp = gpar(fontsize=8,col=row_anno$HVG_color[id.select]),
                                                                     labels=row_anno.new$genename),
                                                       annotation_name_side="top"))
```


```{r,fig.width=9,fig.height=8}
#draw(p1,padding=unit(c(2,2,2,2),"mm"))
# Separate legend for the pseudotime color bar, then draw p1 with all legends
# merged at the bottom.
lgd_pse=Legend(title = "Pseudotime", col = pseudotime_col_fun, at = c(0,0.5,1), 
    labels = c("low","med","high"),legend_height = unit(2.5, "cm"),border = NA,title_position = "topcenter",direction="horizontal")
draw(p1,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")
```

## pseudotime from raw gene expression

```{r,echo=F,include=F,eval=F}
# Pseudotime on the raw data. This chunk is marked eval=F, so it is not run
# when the document is knitted. A cached ./cds_raw.rds is reused when present;
# otherwise the cell_data_set is built from the raw counts and processed.
if(file.exists("./cds_raw.rds")){
  cds=readRDS("./cds_raw.rds")
}else{
  cell.meta.data=obj0@meta.data
  cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
  gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data)),row.names = make.unique(rownames(raw.data)))
  #pd <- new("AnnotatedDataFrame",data=cell.meta.data)
  #fd <- new("AnnotatedDataFrame",data=gene_ann)
  cds <- new_cell_data_set(raw.data, cell_metadata = cell.meta.data,gene_metadata =gene_ann)
  ## Step 1: Normalize and pre-process the data
  cds <- preprocess_cds(cds, num_dim = 32,method="PCA",norm_method="log",verbose = F)
  ## Step 2: Remove batch effects with cell alignment
  ##cds <- align_cds(cds, alignment_group = "BatchID", residual_model_formula_str = NULL)
  ## Step 3: Reduce the dimensions using UMAP
  cds <- reduce_dimension(cds,reduction_method = "UMAP",preprocess_method="PCA",verbose = F)
  
  ## Step 4: Cluster the cells
  cds <- cluster_cells(cds,reduction_method ="UMAP",cluster_method = "leiden",verbose = F)
  # Construct the graph
  # Note that, for the rest of the code to run, the graph should be fully (partionly) connected
  ## Step 5: Learn a graph
  cds <- learn_graph(cds, use_partition = T,verbose = F)
  # Copy the UMAP cluster labels into colData so they can be looked up by name.
  colData(cds)$clusters=cds@clusters$UMAP$clusters
}
# Diagnostic plots colored by partition and by cluster.
pp1=plot_cells(cds,color_cells_by = "partition",label_cell_groups = F)+theme(legend.position = "top")
pp2=plot_cells(cds,color_cells_by = "clusters",label_cell_groups=F,graph_label_size=2, label_leaves=F,label_branch_points=F)+theme(legend.position = "top")
p=cowplot::plot_grid(pp1,pp2,align = "h",ncol = 3)

## Step 6: Order cells
# a helper function to identify the root principal points:
get_earliest_principal_node <- function(cds, cluster=c("1","5")){
  # For each requested cluster, return the name of the principal-graph node
  # (UMAP) that is the closest vertex for the largest number of that cluster's
  # cells.
  #
  # Arguments:
  #   cds      cell_data_set with a learned UMAP principal graph and a
  #            "clusters" column in colData.
  #   cluster  cluster labels to use as roots.
  #
  # Returns a character vector with one node name per element of `cluster`.
  # Stops with an error when a requested cluster contains no cells.

  # These lookups do not depend on the cluster, so compute them once.
  closest_vertex <- cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  node_names <- igraph::V(principal_graph(cds)[["UMAP"]])$name
  cell_clusters <- colData(cds)[, "clusters"]

  root_pr_nodes <- vapply(cluster, function(ii){
    cell_ids <- which(cell_clusters %in% ii)
    if (length(cell_ids) == 0) {
      stop("No cells found in cluster '", ii, "'; cannot choose a root node.", call. = FALSE)
    }
    # Most frequent closest vertex among the cells of this cluster.
    top_vertex <- names(which.max(table(closest_vertex[cell_ids,])))
    node_names[as.numeric(top_vertex)]
  }, character(1))
  root_pr_nodes
}
# root cells
# Root nodes are taken from clusters "1", "4" and "5".
ids=get_earliest_principal_node(cds,cluster=c("1","4","5"))
cds <- order_cells(cds,root_pr_nodes = ids)
#plot_cells(cds,color_cells_by = "pseudotime")

#set pseudotime
colData(cds)$pseudotime=pseudotime(cds)
# Pseudotime divided by its maximum (NA ignored).
colData(cds)$Pseudotime=colData(cds)$pseudotime/max(colData(cds)$pseudotime,na.rm = T)
# Cache the result; the next chunk reads this file.
saveRDS(cds,file="cds_raw.rds")
```

```{r}
cds=readRDS("./cds_raw.rds")
```

```{r}
# Number of positions where the cell names of obj0 and cds agree; the
# pseudotime below is copied positionally, so this should equal the number of cells.
print(sum(colnames(obj0)==colnames(cds)))
#10878
obj0$Pseudotime_raw=colData(cds)$Pseudotime

# Same preparation as for the denoised data, now on the raw data (obj0):
# variable-feature statistics, non-zero cell counts and scaling of the TF genes.
obj0=NormalizeData(obj0,verbose = F)
obj0=FindVariableFeatures(obj0,verbose = F)
hvf.info=obj0@assays$RNA@meta.features
gene22=hvf.info[order(hvf.info$vst.variance.standardized,  decreasing = TRUE), ,drop = FALSE]
num_cells_expressed=Matrix::rowSums(obj0@assays$RNA@counts[tf_df$Gene,]!=0)
obj0=ScaleData(obj0,features = tf_df$Gene,verbose = F)
```


```{r}
#scaled
#m = t(scale(m1,center = T))
# Genes x cells matrix of scaled TF expression from obj0 (raw data).
m=t(FetchData(obj0,vars = tf_df$Gene,slot = "scale.data"))
# Replace NaN by 0 and clip the values to [-3, 3].
m[is.nan(m)] = 0
m[m >= 3] = 3
m[m <= -3] = -3
heatmap_matrix <- m
# Gene-gene distance (1 - correlation) / 2.
row_dist <- as.dist((1 - cor(t(heatmap_matrix)))/2)
# pheatmap is used only for its row clustering (silent, no output file).
res2 <- list(ph=pheatmap::pheatmap(heatmap_matrix, useRaster = T, cluster_cols = FALSE, 
        cluster_rows = T, show_rownames = F, show_colnames = F, 
        clustering_distance_rows = row_dist, clustering_method =  "ward.D2", 
        cutree_rows = 3, silent = TRUE, filename = NA),m=heatmap_matrix)
 
```


```{r}
# Reorder the genes (rows) by the dendrogram order and cut the tree into 3 clusters.
z=res2[[2]][res2[[1]]$tree_row$order,]
Cluster_labels=cutree(res2[[1]]$tree_row,3)[res2[[1]]$tree_row$order]
# Per-gene annotation table, in the same row order as z.
# NOTE(review): "GeneType" is not created in the code shown here — presumably a
# column of TF_blood.xlsx; confirm.
row_anno=data.frame(genename=rownames(z),
                    Cluster=Cluster_labels,
                    Class=tf_df[rownames(z),"Gene"],
                    VarianceType=tf_df[rownames(z),"VarianceType"],
                    GeneType=tf_df[rownames(z),"GeneType"],
                    vst.mean=gene22[rownames(z),"vst.mean"],
                    vst.variance.standard=gene22[rownames(z),"vst.variance.standardized"],
                    num_cells_expressed=num_cells_expressed[rownames(z)],
                    row.names = rownames(z),stringsAsFactors = F)
# Rename the cluster ids to "Module1", "Module2", ...
row_anno$GeneClass=plyr::mapvalues(row_anno$Cluster,from=sort(unique(row_anno$Cluster)),
                                   to=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels))))))

#order_id0=order(row_anno$Cluster,row_anno$GeneType,row_anno$VarianceType)
#z=z[order_id0,]
#row_anno=row_anno[order_id0,]
#GeneClass.color=gg_color_hue(length(unique(Cluster_labels)))

# One "Dark2" color per module.
GeneClass.color=RColorBrewer::brewer.pal(3,"Dark2")
names(GeneClass.color)=paste0("Module",as.numeric(factor(sort(unique(Cluster_labels)))))
# gene Module color 
row_anno$color=plyr::mapvalues(row_anno$GeneClass,from=names(GeneClass.color),
                                   to=GeneClass.color)
# Label color per gene: red for HVG, blue for LVG.
row_anno$HVG_color=plyr::mapvalues(as.character(row_anno$VarianceType),from=c("HVG","LVG"),
                                   to=c("#E41A1C","#377EB8"))
```

```{r}
# Build heatmap p2: scaled TF expression from the raw data, with cells
# (columns) ordered by the pseudotime computed on the raw data (Pseudotime_raw).
rownames(row_anno)=row_anno$genename
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# columan annotation
#col_anno=data.frame(pseudotime=seq(0,1,length=ncol(z)),
#                    row.names = as.character(1:ncol(z)))
col_anno=data.frame(BatchID=plyr::mapvalues(obj0$BatchID,c("MH001","RP002","RP009"),c("T1","T2","T3")),
                    Pseudotime=obj0$Pseudotime_raw,
                    #Pseudotime=obj0$Pseudotime,
                    row.names = colnames(obj0))
# Sort columns (cells) by increasing pseudotime; the same permutation is
# applied to the matrix and to its annotation.
col_id=order(col_anno$Pseudotime)
z=z[,col_id]
col_anno=col_anno[col_id,]
# 
set.seed(10)

# Genes to label: keep genes with vst.mean > 0.1, rank them by decreasing
# vst.mean within each VarianceType, and take ranks 1..20 of each group.
row_anno.tmp=row_anno%>%
  filter(vst.mean>0.1)%>%
  group_by(VarianceType)%>%
  arrange(desc(vst.mean),.by_group = TRUE)%>%
  mutate(n_order=1:n())%>%as.data.frame()

#id.select=order(row_anno$qval,decreasing = F)[1:30]
# Row positions (in heatmap order) of the genes to label.
id.select=which(row_anno$genename%in%row_anno.tmp$genename[row_anno.tmp$n_order%in%c(1:20)])
row_anno.new=row_anno[id.select,]
# Blue-white-red color scale over the clipped range [-3, 3].
col_fun = circlize::colorRamp2(seq(-3, 3,length=200), colorRampPalette(c("blue", "white", "red"))(200))
#pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(c("darkblue","black","#FFFF00"))(150))
pseudotime_col_fun =circlize::colorRamp2(seq(0, 1,length=150), colorRampPalette(monocle:::blue2green2red(150))(150))
# Column annotation: pseudotime color bar and batch.
top_anno=HeatmapAnnotation( Pseudotime = anno_simple(col_anno$Pseudotime, col = pseudotime_col_fun),
                            BatchID=col_anno$BatchID,
                             col=list(BatchID=c("T1"="#66C2A5","T2"="#FC8D62","T3"="#8DA0CB")),
                          show_legend = c(T,T),
                          annotation_name_side = "right")

# Row annotation: only VarianceType is active. NOTE(review): `col` still lists a
# GeneClass palette although the GeneClass annotation is commented out.
left_anno=rowAnnotation(#GeneClass=row_anno$GeneClass,
                        #GeneType=row_anno$GeneType,
                         VarianceType=row_anno$VarianceType,
                      #Nonzero=anno_barplot(log10(row_anno$num_cells_expressed),
                      #                              gp  = gpar(fill ="#00C4FF",col=NA),
                      #                              bar_width = 1,
                      #                              height = unit(2.5, "cm")),
                      
                        annotation_name_rot=90,
                      annotation_name_side = "top",
                        #col=list(GeneClass=celltype_color[names(celltype_color)%in%unique(tmp.anno$celltype)]),
                        col=list(GeneClass=GeneClass.color,
                                 VarianceType=c("HVG"="#E41A1C","LVG"="#377EB8")),
                        show_annotation_name=c(F,T),
                        show_legend=c(T,T))
# No clustering inside Heatmap(): rows and columns keep the order set above.
# Row/column labels are blank; the selected genes are labelled through
# anno_mark() on the right, colored by HVG/LVG.
p2=ComplexHeatmap::Heatmap(z, name = "scaled.expression", 
                      cluster_rows = F,
                      cluster_columns = F,
                      col=col_fun,
                      column_labels =  rep("",length=ncol(z)),
                      row_labels = rep("",length=nrow(z)),
                      #row_labels = gene.use.df$gene,
                      row_names_side = "right",
                      #the following fontsize is nonusefull for this situattion
                      row_names_gp = gpar(fontsize=5),
                      column_names_gp = gpar(fontsize = 8),
                      #column_gap = unit(0.5, "mm"),
                      #column_split = tmp.anno$res,
                      #column_title = "%s",
                      #column_title_rot=90,
                      #row_split =gene.use.df$cluster,
                      row_gap = unit(0.5, "mm"),
                      #row_split = row_anno$GeneClass,
                      #row_title = "%s",
                      row_title_gp = gpar(fontsize = 12),
                      row_title_rot = 90,
                      top_annotation = top_anno,
                      left_annotation= left_anno,
                       heatmap_legend_param=list(legend_direction="horizontal",legend_width = unit(2.5, "cm")),
                      right_annotation = rowAnnotation(foo=anno_mark(at=id.select,
                                                                     side="right",
                                                                     extend=unit(0.2,"cm"),
                                                                     labels_gp = gpar(fontsize=8,col=row_anno$HVG_color[id.select]),
                                                                     labels=row_anno.new$genename),
                                                       annotation_name_side="top"))
```


```{r,fig.width=9,fig.height=8}
#draw(p1,padding=unit(c(2,2,2,2),"mm"))
# Separate legend for the pseudotime color bar, then draw p2 with all legends
# merged at the bottom.
lgd_pse=Legend(title = "Pseudotime", col = pseudotime_col_fun, at = c(0,0.5,1), 
    labels = c("low","med","high"),legend_height = unit(2.5, "cm"),border = NA,title_position = "topcenter",direction="horizontal")
draw(p2,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")
```

```{r}
#save plots
# Each heatmap (p1: denoised, p2: raw pseudotime) is written twice, as a
# 9 x 8 inch PDF and as a 300 dpi LZW-compressed TIFF, into the working directory.
suppressPackageStartupMessages(library(cowplot))
lgd_pse=Legend(title = "Pseudotime", col = pseudotime_col_fun, at = c(0,0.5,1), 
    labels = c("low","med","high"),legend_height = unit(2.5, "cm"),border = NA,title_position = "topcenter",direction="horizontal")

cairo_pdf("denoised_heatmap_nosmooth.pdf",width=9,height=8)
    draw(p1,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")
dev.off()

tiff("denoised_heatmap_nosmooth.tiff",units="in",compression="lzw",res=300,width=9,height=8)
    draw(p1,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")
dev.off()

cairo_pdf("raw_pseudotime_heatmap_nosmooth.pdf",width=9,height=8)
    draw(p2,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")
dev.off()

tiff("raw_pseudotime_heatmap_nosmooth.tiff",units="in",compression="lzw",res=300,width=9,height=8)
    draw(p2,annotation_legend_list = list(lgd_pse),merge_legends=T,heatmap_legend_side="bottom")
dev.off()
```

# reading results from other methods 

```{r}
# Reload the raw data into a fresh Seurat object (obj_raw) for the method
# comparison, and keep its per-gene vst.mean for joining by gene name.
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad("../../dca_test.h5ad")
obj_raw=Convert_to_seurat3(adata)
obj_raw=NormalizeData(obj_raw,verbose = F)
obj_raw=FindVariableFeatures(obj_raw,verbose = F)
xx_vst.mean=obj_raw@assays$RNA@meta.features[,c("vst.mean"),drop=F]
xx_vst.mean$gene=rownames(xx_vst.mean)

raw.data=obj_raw@assays$RNA@counts
```


```{r}
# Mapping from the batch labels to the short batch names T1/T2/T3.
maprules=c("2017_0801"="T1","2017_1017"="T2","2017_1120"="T3")
maprules

# One fixed color per method, named by method.
Methods_color=c("#E41A1C","#377EB8","#4DAF4A","#984EA3","#FF7F00","#00cc99")
names(Methods_color)=c("scVI","CarDEC","DCA","MNN","Raw","Scanorama")
```

```{r,fig.width=7,fig.height=3}
# Preview the method palette as a row of colored tiles, with each tile's hex
# code written along the top axis; the previous margins are restored afterwards.
op=par(mar=c(5,4,6,4))
image(x=seq_along(Methods_color),y=1,z=as.matrix(seq_along(Methods_color)),
      col=Methods_color,xlab = "",ylab = "")
axis(side=3,at=seq_along(Methods_color),labels=Methods_color,las=2,lwd=0)
par(op)
```


```{r}
adata=ad$read_h5ad("../CarDEC Results/adata_CarDEC.h5ad")
```


```{r}
# Cell metadata of the CarDEC output, with batch labels mapped through `maprules`.
cell.meta.data=py_to_r(adata$obs)
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
# Gene metadata of the CarDEC output.
gene_ann_cardec=py_to_r(adata$var)

# 'denoised counts' layer, transposed; columns are then named by cell and
# rows by gene.
mtx=t(py_to_r(adata$layers['denoised counts']))
colnames(mtx)=cell.meta.data$cellname
rownames(mtx)=rownames(gene_ann_cardec)
# Per-cell factor 1e4 / total count. NOTE(review): not referenced again in the
# code shown here.
mtx_sizefactor=1e4/colSums(mtx)
```

```{r}
# Seurat object built from the CarDEC denoised matrix plus the cell metadata.
obj_cardec=CreateSeuratObject(mtx,meta.data = cell.meta.data)
obj_cardec=NormalizeData(obj_cardec,verbose = F)
# Use "BatchID" as the identity of the cells.
Idents(obj_cardec)="BatchID" #obj1 means denoised count by CarDEC
```


```{r}
suppressPackageStartupMessages(library(monocle3))
# Pseudotime from the CarDEC trajectory is stored on both the CarDEC object and
# the raw object; values are copied positionally (cell order is assumed to
# match — TODO confirm).
cds=readRDS("./cds_cardec.rds")
obj_cardec$Pseudotime=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm=T)
obj_raw$Pseudotime=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm=T)
# Free the CarDEC trajectory before loading the one computed on the raw data.
rm(cds)
gc()
cds=readRDS("./cds_raw.rds")
obj_raw$Pseudotime_raw=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm=T)
```

## DCA+combat

```{r}
adata=ad$read_h5ad("../final_processed_results/dca Results New/adata_all.h5ad")
```

```{r}
# DCA result: build a Seurat object from adata.X (transposed, named by gene and
# cell) and attach the pseudotime stored in cds_dca.rds.
cell.meta.data=py_to_r(adata$obs)
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann0=py_to_r(adata$var)
gene_ann=data.frame(gene_short_name = make.unique(rownames(gene_ann0)),
                    row.names = make.unique(rownames(gene_ann0)))
mtx=t(py_to_r(adata$X))
colnames(mtx)=cell.meta.data$cellname
rownames(mtx)=rownames(gene_ann)
# NOTE(review): mtx_sizefactor is not referenced again in the code shown here.
mtx_sizefactor=1e4/colSums(mtx)
obj_dca=CreateSeuratObject(mtx,meta.data=cell.meta.data)
#obj_dca=NormalizeData(object = obj_dca,verbose = F)
cds=readRDS("./cds_dca.rds")
# Copied positionally; assumes the same cell order in cds and obj_dca — TODO confirm.
obj_dca$Pseudotime=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm=T)
```

## scVI

```{r}
adata=ad$read_h5ad("../final_processed_results/scVI Results New/monocytes_ALL/adata_all.h5ad")
```

```{r}
# scVI result: build a Seurat object from adata.X (transposed, named by gene
# and cell) and attach the pseudotime stored in cds_scvi.rds.
cell.meta.data=py_to_r(adata$obs)
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann0=py_to_r(adata$var)
gene_ann=data.frame(gene_short_name = make.unique(rownames(gene_ann0)),
                    row.names = make.unique(rownames(gene_ann0)))
mtx=t(py_to_r(adata$X))
colnames(mtx)=cell.meta.data$cellname
rownames(mtx)=rownames(gene_ann)
# NOTE(review): mtx_sizefactor is not referenced again in the code shown here.
mtx_sizefactor=1e4/colSums(mtx)
obj_scvi=CreateSeuratObject(mtx,meta.data=cell.meta.data)
#obj_scvi=NormalizeData(object = obj_scvi,verbose = F)
cds=readRDS("./cds_scvi.rds")
# Copied positionally; assumes the same cell order in cds and obj_scvi — TODO confirm.
obj_scvi$Pseudotime=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm = T)
```

## MNN

```{r}
# MNN result: the `corrected` assay of the saved object is used as the
# expression matrix, its colData as cell metadata.
output=readRDS("../final_processed_results/MNN_corrected_all.rds")
mtx=output@assays$data$corrected
cell.meta.data=colData(output)
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(mtx)),
                    row.names = make.unique(rownames(mtx)))
obj_mnn=CreateSeuratObject(mtx,meta.data=as.data.frame(cell.meta.data))
cds=readRDS("./cds_mnn.rds")
# Copied positionally; assumes the same cell order in cds and obj_mnn — TODO confirm.
obj_mnn$Pseudotime=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm=T)
```

## scanorama

```{r}
#add when revised in Genome Research
# Scanorama result: unlike the other methods, the matrix and gene metadata are
# read from adata.raw (X and var) rather than from adata.X / adata.var.
adata=ad$read_h5ad("../final_processed_results/scanorama Results/adata_ALL.h5ad")#
cell.meta.data=py_to_r(adata$obs)
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann0=py_to_r(adata$raw$var)
gene_ann=data.frame(gene_short_name = make.unique(rownames(gene_ann0)),
                    row.names = make.unique(rownames(gene_ann0)))
mtx=t(py_to_r(adata$raw$X$tocsc()))#
colnames(mtx)=cell.meta.data$cellname
rownames(mtx)=rownames(gene_ann)
obj_scanorama=CreateSeuratObject(mtx,meta.data=as.data.frame(cell.meta.data))
#obj_scanorama=NormalizeData(obj_scanorama,verbose = F)
cds=readRDS("./cds_scanorama.rds")
# Copied positionally; assumes the same cell order in cds and obj_scanorama — TODO confirm.
obj_scanorama$Pseudotime=colData(cds)$Pseudotime
#/max(colData(cds)$Pseudotime,na.rm=T)
```

```{r}
# Per-gene average of the CarDEC denoised counts (Seurat's internal FastExpMean
# helper), here additionally passed through log1p().
avg_exp=log1p(Seurat:::FastExpMean(obj_cardec@assays$RNA@counts,display_progress = F))
# Gene annotation table keyed by de-duplicated gene name: CarDEC's
# `Variance Type` plus the average expression computed above.
gene_ann_all=data.frame(gene_short_name = make.unique(rownames(obj_cardec)),
                    VarianceType=gene_ann_cardec$`Variance Type`,
                    avg_exp=avg_exp,
                    row.names = make.unique(rownames(obj_cardec)))
```

```{r}
# Read the TF list, keep the TFs present in obj_raw, then attach by gene name
# the CarDEC annotation (VarianceType, avg_exp) and the raw-data vst.mean.
tf_df=openxlsx::read.xlsx("./TF_blood.xlsx")%>%filter(Gene%in%rownames(obj_raw))%>%
  left_join(gene_ann_all,by=c("Gene"="gene_short_name"))%>%
  left_join(xx_vst.mean,by=c("Gene"="gene"))%>%
  as.data.frame()
# Row names = gene names, so later code can index tf_df by gene.
rownames(tf_df)=tf_df$Gene
DT::datatable(tf_df)
```


After filtering out genes with average expression (`avg_exp`) below 0.05, we have:

```{r}
# Keep only the TFs whose avg_exp is at least 0.05.
tf_df=subset(tf_df,tf_df$avg_exp>=0.05 )
DT::datatable(tf_df)
```

```{r}
#write.table(tf_df[,c(1,2,3)],file = "tmp.csv",sep=",",col.names = T,row.names = F)
```

```{r}
# Build a per-cell data frame of expression values ordered along pseudotime.
#
# Arguments:
#   obj   Object accepted by FetchData() that also exposes `$Pseudotime` and
#         `$batch_label`.
#   gene  Character vector passed to FetchData(vars = ).
#
# Returns a data.frame with one column per fetched variable plus:
#   pseudotime     obj$Pseudotime
#   Pseudotime_01  pseudotime divided by the largest finite pseudotime
#   BatchID        obj$batch_label relabelled through the global `maprules`
#   x              same scaling as Pseudotime_01, kept under its original name
# Rows with non-finite pseudotime are dropped and the remaining rows are
# sorted by increasing pseudotime.
get_raw_denoised_df_new <- function(obj, gene) {
  df0 <- FetchData(object = obj, vars = gene)

  pt <- obj$Pseudotime
  df0$pseudotime <- pt

  # Scale by the largest *finite* value. max(..., na.rm = TRUE) ignores NA but
  # not Inf, so a single infinite pseudotime would otherwise squash every
  # finite cell to 0.
  finite_pt <- pt[is.finite(pt)]
  pt_max <- if (length(finite_pt) > 0) max(finite_pt) else NA_real_
  df0$Pseudotime_01 <- pt / pt_max # for plot

  df0$BatchID <- plyr::mapvalues(obj$batch_label, names(maprules), maprules)

  df0 <- df0[is.finite(df0$pseudotime), , drop = FALSE]
  df0 <- df0[order(df0$pseudotime, decreasing = FALSE), , drop = FALSE]
  df0$x <- df0$pseudotime / pt_max
  df0
}
suppressPackageStartupMessages(library(mgcv))

# Test whether BatchID explains a gene's expression beyond a smooth pseudotime
# trend, overall and with each batch left out in turn.
#
# Two GAMs are fitted with mgcv::gam():
#   model 1:  gene ~ s(pseudotime, bs = "cs")
#   model 2:  gene ~ BatchID + s(pseudotime, bs = "cs")
# and compared with anova(..., test = "Chisq") and anova(..., test = "F").
# The same comparison is repeated after dropping each batch, i.e. on the
# remaining batches only.
#
# Arguments:
#   df00               data.frame with columns `pseudotime`, `BatchID` and a
#                      column named `gene`.
#   gene               Name of the expression column to model.
#   return_pval_table  TRUE: return a one-row data.frame of p-values.
#                      FALSE: return the fitted models and anova tables.
#
# Returns:
#   TRUE  -> data.frame with columns gene, "T1 v.s. T2", "T1 v.s. T3",
#            "T2 v.s. T3" (each for Chisq and F) and "Overall (Chisq)",
#            "Overall (F)". The pairwise labels are hard-coded, so BatchID
#            must contain batches named T1, T2 and T3.
#   FALSE -> list(single = <overall fit>, pariwise = <per-left-out-batch fits>);
#            the element name `pariwise` is kept as-is for existing callers.
get_pval_pair_new <- function(df00, gene, return_pval_table = TRUE) {
  df00_tmp <- df00
  id_gene <- which(colnames(df00_tmp) == gene)
  if (length(id_gene) == 0) {
    stop("Column '", gene, "' not found in df00", call. = FALSE)
  }

  # The response is renamed before building the formula from text. Stripping
  # "-" and "." alone is not enough (e.g. a name starting with a digit), so
  # make.names() guarantees a syntactic name, and make.unique() keeps it from
  # colliding with another column. Names that were already fine are unchanged.
  gene_new <- make.names(gsub("-|\\.", "", gene))
  other_cols <- colnames(df00_tmp)[-id_gene]
  gene_new <- utils::tail(make.unique(c(other_cols, gene_new)), 1)
  colnames(df00_tmp)[id_gene] <- gene_new

  formula1 <- as.formula(paste0(gene_new, "~", "s(pseudotime,bs=\"cs\")"))
  formula2 <- as.formula(paste0(gene_new, "~BatchID+", "s(pseudotime,bs=\"cs\")"))

  # Fit both models on `dat` and compare them with both tests.
  fit_and_compare <- function(dat) {
    mod1 <- gam(formula1, data = dat, select = FALSE)
    mod2 <- gam(formula2, data = dat, select = FALSE)
    list(
      mod1 = mod1,
      mod2 = mod2,
      m1 = anova(mod1, mod2, test = "Chisq"),
      m2 = anova(mod1, mod2, test = "F")
    )
  }

  unique_batch <- sort(as.character(unique(df00_tmp$BatchID)), decreasing = TRUE)

  if (return_pval_table) {
    missing_batches <- setdiff(c("T1", "T2", "T3"), unique_batch)
    if (length(missing_batches) > 0) {
      stop(
        "BatchID must contain batches T1, T2 and T3; missing: ",
        paste(missing_batches, collapse = ", "),
        call. = FALSE
      )
    }
  }

  overall <- fit_and_compare(df00_tmp)

  # Each entry is named after the batch that was LEFT OUT, so entry "T3" holds
  # the comparison on the remaining batches (T1 and T2), and so on.
  res_pairwise <- lapply(unique_batch, function(x) {
    fit_and_compare(df00_tmp[df00_tmp$BatchID != x, , drop = FALSE])
  })
  names(res_pairwise) <- unique_batch

  if (!return_pval_table) {
    return(list(single = overall, pariwise = res_pairwise))
  }

  col_names <- c(
    "gene",
    paste0(
      rep(c("T1 v.s. T2", "T1 v.s. T3", "T2 v.s. T3"), times = 2),
      rep(c(" (Chisq)", " (F)"), each = 3)
    ),
    "Overall (Chisq)", "Overall (F)"
  )
  res0 <- data.frame(matrix(NA, ncol = length(col_names), nrow = 1))
  colnames(res0) <- col_names
  res0[1, 1] <- gene
  # Row 2 of each anova table is the model-2-vs-model-1 comparison.
  res0[1, 2] <- res_pairwise[["T3"]]$m1$`Pr(>Chi)`[2]
  res0[1, 3] <- res_pairwise[["T2"]]$m1$`Pr(>Chi)`[2]
  res0[1, 4] <- res_pairwise[["T1"]]$m1$`Pr(>Chi)`[2]
  res0[1, 5] <- res_pairwise[["T3"]]$m2$`Pr(>F)`[2]
  res0[1, 6] <- res_pairwise[["T2"]]$m2$`Pr(>F)`[2]
  res0[1, 7] <- res_pairwise[["T1"]]$m2$`Pr(>F)`[2]
  res0[1, 8] <- overall$m1$`Pr(>Chi)`[2]
  res0[1, 9] <- overall$m2$`Pr(>F)`[2]
  res0
}
```

```{r,echo=F,include=F,eval=F}
# get the p-value
# This chunk is marked eval=F in its header: it is run by hand, and its
# results are saved to / reloaded from an RDS file in the following chunks.
obj_name_list <- c("obj_raw", "obj_scanorama", "obj_cardec", "obj_scvi", "obj_dca", "obj_mnn")

# One pseudotime-ordered expression table per method, keyed by object name.
df0_list <- lapply(obj_name_list, function(x) {
  get_raw_denoised_df_new(get(x), gene = tf_df$Gene)
})
names(df0_list) <- obj_name_list

# Run get_pval_pair_new() for every gene in tf_df on `df`, stack the one-row
# results, and tag them with the gene's variance type and `group_label`.
# genetype is always stored as character so the tables of all methods have the
# same column types (the scVI table previously skipped this conversion).
collect_pvals <- function(df, group_label) {
  res <- do.call("rbind", lapply(tf_df$Gene, function(x) {
    tmp00 <- get_pval_pair_new(df, gene = x)
    tmp00$genetype <- as.character(tf_df$VarianceType[tf_df$Gene == x])
    tmp00
  }))
  res$Group <- group_label
  res
}

res_pval_raw <- collect_pvals(df0_list[["obj_raw"]], "Raw_0")
res_pval_cardec <- collect_pvals(df0_list[["obj_cardec"]], "CarDEC")
res_pval_scvi <- collect_pvals(df0_list[["obj_scvi"]], "scVI")
res_pval_dca <- collect_pvals(df0_list[["obj_dca"]], "DCA")
res_pval_mnn <- collect_pvals(df0_list[["obj_mnn"]], "MNN")
res_pval_scanorama <- collect_pvals(df0_list[["obj_scanorama"]], "Scanorama")

# pseudotime from raw: same expression values as obj_raw, but ordered by
# obj_raw$Pseudotime_raw instead of obj_raw$Pseudotime.
df0 <- FetchData(obj_raw, vars = tf_df$Gene)
df0$pseudotime <- obj_raw$Pseudotime_raw
# NOTE(review): batch labels are taken from obj_cardec, which presumably lists
# cells in the same order as obj_raw -- confirm.
df0$BatchID <- obj_cardec$dataset_batch
df0 <- df0[is.finite(df0$pseudotime), ]
df0 <- df0[order(df0$pseudotime, decreasing = FALSE), , drop = FALSE]
df0$x <- df0$pseudotime / max(df0$pseudotime)

res_pval_raw_2 <- collect_pvals(df0, "Raw")

```

```{r,echo=F,include=F,eval=F}
# Persist every p-value table in one named list so the report can reload them
# without refitting the models (previous file name: res_pval_new.rds).
saveRDS(
  list(
    res_pval_raw = res_pval_raw,
    res_pval_raw_2 = res_pval_raw_2,
    res_pval_cardec = res_pval_cardec,
    res_pval_scvi = res_pval_scvi,
    res_pval_dca = res_pval_dca,
    res_pval_mnn = res_pval_mnn,
    res_pval_scanorama = res_pval_scanorama
  ),
  file = "res_pval_new_revised.rds"
)
```

```{r}
# Reload the saved list of per-method p-value tables.
res_pval_list=readRDS("./res_pval_new_revised.rds")
```


# Final Plots

In this section, I report results using the pseudotime inferred separately by each of the compared methods, including MNN, scVI, DCA, and Scanorama.

## Feature plots 

```{r}
# Set the default ggplot theme for the plots below: black-and-white base with
# white facet strips and no panel/legend background or grid lines. The theme
# that was active before is kept in `old`.
old <- theme_set(
  theme_bw() +
    theme(
      strip.background = element_rect(fill = "white"),
      panel.background = element_blank(),
      legend.background = element_blank(),
      panel.grid = element_blank()
    )
)
```



```{r}

# Overall Chisq p-values for every method except the "Raw" group; get_plot1()
# reads the subtitle p-value from this table.
df0_tmp0 <- subset(df_plot, variable == "Overall (Chisq)" & !(Group %in% "Raw"))

# Display name -> name of the corresponding Seurat object.
ggtitle0 <- c(
  CarDEC = "obj_cardec",
  DCA = "obj_dca",
  MNN = "obj_mnn",
  Raw = "obj_raw",
  scVI = "obj_scvi",
  Scanorama = "obj_scanorama"
)
# Scatter plot of one gene's expression along pseudotime, coloured by batch,
# with a GAM smooth per batch and one overall smooth in black.
#
# Arguments:
#   df00    data.frame with columns `pseudotime`, `BatchID` and a column named
#           `gene`.
#   gene    Name of the expression column to plot.
#   title0  Plot title; also the Group value used to look up the p-value.
#
# Reads two globals: `df0_tmp0` (columns gene, Group, pval) for the subtitle
# p-value, and `tf_df` (columns Gene, VarianceType) for the y-axis label and
# its colour (red for "HVG", blue otherwise).
#
# Returns a ggplot object.
get_plot1 <- function(df00, gene = "S100A8", title0 = "CarDEC") {
  p_val_label <- df0_tmp0$pval[df0_tmp0$gene == gene & df0_tmp0$Group == title0]
  # Show "NA" when no p-value is on record; if several match, show the first.
  p_val_text <- if (length(p_val_label) != 0) {
    scales::scientific(p_val_label)[1]
  } else {
    "NA"
  }

  variance_type <- as.character(tf_df$VarianceType[tf_df$Gene == gene])
  y_title_color <- ifelse(variance_type == "HVG", "red", "blue")

  # .data[[gene]] looks the column up by its literal name, so gene names that
  # are not valid R expressions (e.g. containing "-") are handled correctly.
  p <- ggplot(data = df00, aes(x = pseudotime, y = .data[[gene]])) +
    geom_point(aes(color = BatchID), size = 0.01) +
    # Points are tiny; enlarge them in the legend only.
    guides(color = guide_legend(override.aes = list(size = 5))) +
    geom_smooth(aes(color = BatchID), method = "gam", formula = y ~ s(x, bs = "cs")) +
    geom_smooth(color = "black", method = "gam", formula = y ~ s(x, bs = "cs"), size = 0.5) +
    ylab(paste0(gene, " (", variance_type, ")")) +
    ggtitle(title0, subtitle = paste0("p-value=", p_val_text)) +
    xlab("Pseudotime") +
    theme(
      legend.position = "right",
      plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
      legend.text = element_text(size = 15, face = "bold"),
      axis.title.y = element_text(color = y_title_color),
      legend.title = element_blank()
    ) +
    scale_color_brewer(palette = "Set2")
  p
}


# For every TF (alphabetical order) build the five per-method panels, in the
# order CarDEC, Scanorama, DCA, scVI, MNN. Only the last panel (MNN) keeps its
# legend so that each row shows a single legend on the right.
plist_0 <- lapply(sort(tf_df$Gene), function(gene0) {
  no_legend <- theme(legend.position = "none")

  panel_cardec <- get_plot1(df0_list$obj_cardec, gene = gene0, title0 = "CarDEC") + no_legend
  panel_scanorama <- get_plot1(df0_list$obj_scanorama, gene = gene0, title0 = "Scanorama") + no_legend
  panel_dca <- get_plot1(df0_list$obj_dca, gene = gene0, title0 = "DCA") + no_legend
  panel_scvi <- get_plot1(df0_list$obj_scvi, gene = gene0, title0 = "scVI") + no_legend
  panel_mnn <- get_plot1(df0_list$obj_mnn, gene = gene0, title0 = "MNN")

  list(panel_cardec, panel_scanorama, panel_dca, panel_scvi, panel_mnn)
})
```

```{r,fig.height=3,fig.width=14}
# Arrange each gene's five panels in a single row (built, not drawn).
# seq_along() keeps this safe when plist_0 is empty, where 1:length() would
# iterate over c(1, 0).
plist_1 <- lapply(seq_along(plist_0), function(x) {
  egg::ggarrange(plots = plist_0[[x]], nrow = 1, draw = FALSE)
})
```


```{r,fig.height=3,fig.width=14}
#If we need other genes we can save them using following codes
#for( x in 1:length(plist_0)){
#  ggsave(filename = paste0("./revised_figures/TFs_pseudotime/",sort(tf_df$Gene)[x],".pdf"),egg::ggarrange(plots = plist_0[[x]],nrow = 1),width = #14,height = 3)
#  ggsave(filename = paste0("./revised_figures/TFs_pseudotime/",sort(tf_df$Gene)[x],".tiff"),egg::ggarrange(plots = plist_0[[x]],nrow = 1),width = #14,height = 3,dpi=300,compression="lzw")
#}
```

```{r}
# Positions, within sort(tf_df$Gene), of the four genes shown in the figure.
# NOTE(review): these are hard-coded positions, so they point at different
# genes if the contents of tf_df change -- confirm after any change to the
# filtering above.
id0=c(23,22,25,19)
```

```{r,fig.width=14,fig.height=12}
# Stack the rows of the selected genes, using the positions stored in id0
# instead of repeating the same indices by hand.
plot_grid(plotlist = plist_1[id0], align = "v", ncol = 1)
```

```{r}
# Panels for the figure: the genes at positions id0 of sort(tf_df$Gene), five
# methods each, in the order CarDEC, Scanorama, DCA, scVI, MNN. Plot titles
# are removed (method names are drawn once as column headers when the figure
# is assembled) and only the MNN panel keeps its legend. Indexing through id0
# keeps this selection in sync with the other chunks that use it.
plist_00 <- lapply(sort(tf_df$Gene)[id0], function(gene0) {
  no_legend <- theme(legend.position = "none")
  no_title <- theme(plot.title = element_blank())

  # The first panel of each row also gets custom plot margins.
  p1 <- get_plot1(df0_list$obj_cardec, gene = gene0, title0 = "CarDEC") +
    no_legend +
    theme(plot.title = element_blank(), plot.margin = unit(c(0.2, 0, 0.5, 0.2), "cm"))
  p2 <- get_plot1(df0_list$obj_dca, gene = gene0, title0 = "DCA") + no_legend + no_title
  p3 <- get_plot1(df0_list$obj_mnn, gene = gene0, title0 = "MNN") + no_title
  p4 <- get_plot1(df0_list$obj_scvi, gene = gene0, title0 = "scVI") + no_legend + no_title
  p5 <- get_plot1(df0_list$obj_scanorama, gene = gene0, title0 = "Scanorama") + no_legend + no_title

  list(p1, p5, p2, p4, p3)
})
```

```{r fig.width=15,fig.height=12}
# Lay the 4 genes x 5 methods panels out on a five-column grid (one row per
# gene); the arrangement is built but not drawn here.
pp0 <- egg::ggarrange(
  plots = c(
    plist_00[[1]],
    plist_00[[2]],
    plist_00[[3]],
    plist_00[[4]]
  ),
  ncol = 5,
  draw = FALSE
)
```

## Heatmap plots

```{r,fig.width=16,fig.height=7}
suppressPackageStartupMessages(library(cowplot))

# Wrap the two pre-rendered heatmap PDFs as cowplot drawings.
p.denoised <- ggdraw() +
  draw_image(magick::image_read_pdf("denoised_heatmap_nosmooth.pdf"), scale = 1)
p.raw <- ggdraw() +
  draw_image(magick::image_read_pdf("./raw_pseudotime_heatmap_nosmooth.pdf"), scale = 1)

# Place them side by side (each half the width) and leave the top 2% of the
# canvas for the two captions.
p_heatmap <- ggdraw() +
  draw_plot(p.denoised, x = 0, y = 0, width = 0.5, height = 0.98) +
  draw_plot(p.raw, x = 0.5, y = 0, width = 0.5, height = 0.98) +
  draw_label(
    "Denoised gene expression (pseudotime by CarDEC)",
    x = 0.25, y = 1, hjust = 0.5, vjust = 1, size = 20
  ) +
  draw_label(
    "Raw gene expression (pseudotime by Raw)",
    x = 0.75, y = 1, hjust = 0.5, vjust = 1, size = 20
  )

```

```{r,fig.width=18,fig.height=8}
# Render the combined heatmap figure at this chunk's figure size.
p_heatmap
```

## boxplot of pval

```{r,fig.width=8,fig.height=4}
# Rows of df_plot for one gene class ("HVG" or "LVG"): overall Chisq test only,
# the "Raw" group excluded, and Group ordered as it should appear on the
# x-axis. Rows where the condition is NA are dropped, as subset() would do.
select_overall_chisq <- function(gene_class) {
  keep <- which(
    df_plot$genetype == gene_class &
      df_plot$variable == "Overall (Chisq)" &
      !df_plot$Group %in% "Raw"
  )
  dat <- df_plot[keep, , drop = FALSE]
  dat$Group <- factor(dat$Group, levels = c("CarDEC", "Scanorama", "DCA", "scVI", "MNN"))
  dat
}

# Boxplot (with jittered points) of pval_log10 per method for the rows in
# `dat`. The dotted red line is drawn at y = 2; colours come from the global
# `Methods_color`.
draw_pval_boxplot <- function(dat, title0) {
  ggplot(dat, aes(x = Group, y = pval_log10, fill = Group)) +
    geom_boxplot(
      width = 0.5, color = "blue", outlier.color = NA, size = 0.2,
      position = position_dodge(0.6)
    ) +
    geom_jitter(
      alpha = 0.4, size = 0.4,
      position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.6)
    ) +
    theme_cowplot() +
    geom_hline(yintercept = 2, color = "red", lty = 3) +
    theme(axis.text.x = element_text(angle = 20, hjust = 1)) +
    ylab(expression(paste("-", log[10], "(", p, ".", value, ")"))) +
    scale_fill_manual(values = Methods_color) +
    xlab("") +
    ggtitle(title0) +
    theme(
      legend.title = element_blank(),
      legend.position = "none",
      plot.title = element_text(hjust = 0.5)
    )
}

# NOTE(review): the gene counts in the two titles are hard-coded; confirm they
# still match the number of genes in each class if tf_df changes.
df0_tmp <- select_overall_chisq("HVG")
p1 <- draw_pval_boxplot(df0_tmp, "HVGs (23 genes)")

df0_tmp <- select_overall_chisq("LVG")
p2 <- draw_pval_boxplot(df0_tmp, "LVGs (38 genes)")
```

```{r,fig.width=4.5,fig.height=12}
# Stack the HVG boxplot above the LVG boxplot, vertically aligned, and show it.
p_box <- plot_grid(p1, p2, ncol = 1, align = "v")
p_box
```

- The median value for each method in the figure above:

```{r}
# Median and mean of pval_log10, plus the number of rows, for each gene class
# and method (overall Chisq test only, "Raw" group excluded).
df_plot %>%
  subset(variable == "Overall (Chisq)" & !(Group %in% "Raw")) %>%
  group_by(genetype, Group) %>%
  summarise(
    log_pval_median = median(pval_log10),
    logp_mean = mean(pval_log10),
    n = n()
  )
```


## Figure 6

```{r,fig.width=18,fig.height=18}
# Layout units for the composite figure: positions below are written as
# fractions of an 18 (wide) x 20 (tall) canvas.
width <- 18
height <- 20

p6 <- local({
  # Panels: heatmaps across the top, boxplots bottom-left, gene panels
  # bottom-right.
  canvas <- ggdraw() +
    draw_plot(p_heatmap, x = 0, y = 1 - 8 / height, height = 8 / height, width = 1) +
    draw_plot(p_box, x = 0, y = 0.1 / height, height = 11.8 / height, width = 4.5 / width) +
    draw_plot(pp0, x = 4.5 / width, y = 0, height = 11.7 / height, width = 13.5 / width)

  # Panel letters.
  canvas <- canvas +
    draw_label("a", x = 0, y = 1, hjust = 0, vjust = 1, size = 30) +
    draw_label("b", x = 0.5, y = 1, hjust = 0, vjust = 1, size = 30) +
    draw_label("c", x = 0, y = 12 / height, hjust = 0, vjust = 1, size = 30) +
    draw_label("d", x = 4.5 / width, y = 12 / height, hjust = 0, vjust = 1, size = 30)

  # Method names as column headers above the gene panels. Each header sits at
  # the centre of its column (odd tenths of the 13.5-unit-wide block) plus a
  # hand-tuned horizontal nudge.
  method_names <- c("CarDEC", "Scanorama", "DCA", "scVI", "MNN")
  column_centre <- c(1, 3, 5, 7, 9)
  nudge <- c(0.2, -0.1, -0.27, -0.5, -0.68)
  header_x <- (4.5 + 13.5 / 10 * column_centre + nudge) / width

  Reduce(
    function(plot_so_far, i) {
      plot_so_far +
        draw_label(
          method_names[i],
          x = header_x[i], y = 11.9 / height,
          hjust = 0.5, vjust = 1, size = 18, fontface = "bold"
        )
    },
    seq_along(method_names),
    canvas
  )
})
p6
```

```{r}
#ggsave(filename = "./revised_figures/Figure7.pdf",p6,width=18,height = 18)
#ggsave(filename = "./revised_figures/Figure7.tiff",p6,width=18,height = 18,dpi=300,compression="lzw")
```


# How we obtained the p-values in panel c above

We performed hypothesis tests comparing two fitted `gam` models, in two ways.

1. Treating `BatchID` as a predictor, we fitted
    - Model1: `gam(gene~s(pseudotime,bs="cs"))`
    - Model2: `gam(gene~BatchID+s(pseudotime,bs="cs"))`
    - and used a `Chisq test` and an `F test` to compare Model1 with Model2.
2. Treating `BatchID` as a predictor again, we repeated the same comparison pairwise, for T1 vs. T2, T1 vs. T3, and T2 vs. T3 separately.

So we have 8 p-values for each gene (two overall and six pairwise). Because pseudotime can be inferred either from the denoised gene expression produced by CarDEC or from the raw gene expression, we may have two sets of results:

- Using the pseudotime inferred by CarDEC for both, we compared the denoised gene expression with the raw gene expression.
- Using the pseudotime inferred by CarDEC for the denoised data and the pseudotime inferred from the raw gene expression for the raw data, we compared the denoised gene expression with the raw gene expression.

```{r}
# Record the R version and attached package versions used for this report.
sessionInfo()
```



